RetroArch/console/rgl/src/ps3/rgl_ps3_raster.cpp

/* RetroArch - A frontend for libretro.
* RGL - An OpenGL subset wrapper library.
* Copyright (C) 2010-2012 - Hans-Kristian Arntzen
* Copyright (C) 2011-2012 - Daniel De Matteis
*
* RetroArch is free software: you can redistribute it and/or modify it under the terms
* of the GNU General Public License as published by the Free Software Foundation,
* either version 3 of the License, or (at your option) any later version.
*
* RetroArch is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with RetroArch.
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "../rgl.h"
#include "include/GmmAlloc.h"
#include "include/rgl-constants.h"
#include "include/rgl-typedefs.h"
#include "include/rgl-externs.h"
#include "include/rgl-inline.h"
#include <Cg/cg.h>
#include <Cg/CgCommon.h>
#include <Cg/cgBinary.h>
#include <RGL/platform.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
using namespace cell::Gcm;
/*============================================================
MEMORY MANAGER
============================================================ */
#if RGL_ENDIAN == RGL_BIG_ENDIAN
#define ENDIAN_32(X, F) ((F) ? endianSwapWord(X) : (X))
#else
#define ENDIAN_32(X, F) (X)
#endif
int _parameterAlloc = 0;
int _ucodeAlloc = 0;
int rglGetTypeResource( _CGprogram* program, unsigned short typeIndex, short *resourceIndex );
int rglGetTypeResourceID( _CGprogram* program, unsigned short typeIndex );
int rglGetTypeResourceRegisterCountVP( _CGprogram* program, short resourceIndex, int resourceCount, unsigned short *resource );
static void setAttribConstantIndex( CgRuntimeParameter* __restrict ptr, const void* __restrict v, const int ) // index
{
// XXX not sure why the original code tested the unused resource here
_CGprogram *program = ptr->program;
const CgParameterResource *parameterResource = rglGetParameterResource( program, ptr->parameterEntry );
GLuint index = parameterResource->resource - CG_ATTR0;
float * f = ( float* ) v;
rglVertexAttrib4fNV( index, f[0], f[1], f[2], f[3] );
}
void rglPlatformSetVertexRegister4fv( unsigned int reg, const float * __restrict v )
{
// save to shared memory for context restore after flip
__builtin_memcpy( rglGetGcmDriver()->sharedVPConstants + reg*4*sizeof( float ), v, 4*sizeof( float ) );
GCM_FUNC( cellGcmSetVertexProgramParameterBlock, reg, 1, v );
}
// endian swapping of the fragment uniforms, if necessary
#if RGL_ENDIAN == RGL_BIG_ENDIAN
#define SWAP_IF_BIG_ENDIAN(arg) endianSwapWordByHalf(arg)
#elif RGL_ENDIAN == RGL_LITTLE_ENDIAN
#define SWAP_IF_BIG_ENDIAN(arg) arg
#else
#error include missing for endianness
#endif
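// Illustration (not part of the build): on the big-endian PPU, fragment
// uniforms are stored with their 16-bit halves swapped for the RSX, so a
// word 0xAABBCCDD is written as 0xCCDDAABB. A minimal sketch of what
// endianSwapWordByHalf is assumed to do, for plain 32-bit values:
//
//   static inline uint32_t swapWordByHalf( uint32_t v ) // hypothetical helper
//   {
//      return ( v << 16 ) | ( v >> 16 );
//   }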
//on entry, ec has already been advanced to point at the embedded-constant count
template<int SIZE> inline static void swapandsetfp( int ucodeSize, unsigned int loadProgramId, unsigned int loadProgramOffset, unsigned short *ec, const unsigned int * __restrict v )
{
//unsigned int v2[4];
//for (long i=0; i < SIZE; ++i)
//{
// v2[i] = SWAP_IF_BIG_ENDIAN(v[i]);
//}
GCM_FUNC( cellGcmSetTransferLocation, CELL_GCM_LOCATION_LOCAL );
unsigned short count = *( ec++ );
for ( unsigned long offsetIndex = 0; offsetIndex < count; ++offsetIndex )
{
void *pointer=NULL;
const int paddedSIZE = (SIZE + 1) & ~1; // round up: inline transfers require an even word count
GCM_FUNC( cellGcmSetInlineTransferPointer, gmmIdToOffset( loadProgramId ) + loadProgramOffset + *( ec++ ), paddedSIZE, &pointer);
float *fp = (float*)pointer;
float *src = (float*)v;
for (uint32_t j=0; j<SIZE;j++)
{
*fp = cellGcmSwap16Float32(*src);
fp++;src++;
}
}
}
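// Record layout walked by swapandsetfp and the array setters below, as
// implied by the recurring "+1 for the register, +1 for the count, +count"
// pointer arithmetic over program->resources (illustrative sketch only):
//
//   struct EmbeddedConstantRecord   // one per register, tightly packed shorts
//   {
//      unsigned short registerIndex;           // RSX constant register
//      unsigned short count;                   // number of ucode patch sites
//      unsigned short ucodeOffset[/*count*/];  // byte offsets into the fp microcode
//   };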
template<int SIZE> static void setVectorTypefp( CgRuntimeParameter* __restrict ptr, const void* __restrict v )
{
float * __restrict f = ( float* )v;
float * __restrict data = ( float* )ptr->pushBufferPointer;
for ( long i = 0; i < SIZE; ++i ) //TODO: ced: determine whether this host copy serves the get path, re-use of the same shader, or just alignment
data[i] = f[i];
_CGprogram *program = ptr->program;
CgParameterResource *parameterResource = rglGetParameterResource( ptr->program, ptr->parameterEntry );
unsigned short resource = parameterResource->resource;
unsigned short *ec = ( unsigned short * )( ptr->program->resources ) + resource + 1;//+1 to skip the register
if ( RGL_LIKELY( *ec ) )
{
swapandsetfp<SIZE>( program->header.instructionCount*16, program->loadProgramId, program->loadProgramOffset, ec, ( unsigned int * )data );
}
}
template<int SIZE> static void setVectorTypeSharedfpIndex( CgRuntimeParameter* __restrict ptr, const void* __restrict v, const int /*index*/ )
{
RGLcontext * LContext = _CurrentContext;
rglGcmDriver *driver = rglGetGcmDriver();
const CgParameterResource *parameterResource = rglGetParameterResource( ptr->program, ptr->parameterEntry );
unsigned short resource = parameterResource->resource;
unsigned short sharedResource = *(( unsigned short * )( ptr->program->resources ) + resource );
const unsigned int * __restrict vi = ( const unsigned int* )v;
GLuint dstVidOffset = gmmIdToOffset( driver->sharedFPConstantsId ) + sharedResource * 16;
unsigned int values[4];
values[0] = SWAP_IF_BIG_ENDIAN( vi[0] );
values[1] = ( 1 < SIZE ) ? SWAP_IF_BIG_ENDIAN( vi[1] ) : 0;
values[2] = ( 2 < SIZE ) ? SWAP_IF_BIG_ENDIAN( vi[2] ) : 0;
values[3] = ( 3 < SIZE ) ? SWAP_IF_BIG_ENDIAN( vi[3] ) : 0;
GCM_FUNC( cellGcmInlineTransfer, dstVidOffset, values, 4, 0 );
LContext->needValidate |= RGL_VALIDATE_FRAGMENT_SHARED_CONSTANTS;
// XXX we don't care about 32bit wrapping, do we ?
++LContext->LastFPConstantModification;
}
template<int SIZE> static void setVectorTypeSharedfpIndexArray( CgRuntimeParameter* __restrict ptr, const void* __restrict v, const int index )
{
RGLcontext * LContext = _CurrentContext;
rglGcmDriver *driver = rglGetGcmDriver();
const CgParameterResource *parameterResource = rglGetParameterResource( ptr->program, ptr->parameterEntry );
unsigned short resource = parameterResource->resource;
//slow... skip the indices
unsigned short *sharedResourcePtr = (( unsigned short * )( ptr->program->resources ) + resource );//no +1 here, we want the register
int arrayIndex = index;
while ( arrayIndex ) //jump to the right index... this is slow
{
sharedResourcePtr += (( *sharedResourcePtr ) + 2 );//+1 for the register, +1 for the count, +count for the number of embedded consts
arrayIndex--;
}
unsigned short sharedResource = *sharedResourcePtr;
const unsigned int * __restrict vi = ( const unsigned int* )v;
GLuint dstVidOffset = gmmIdToOffset( driver->sharedFPConstantsId ) + sharedResource * 16;
unsigned int values[4];
values[0] = SWAP_IF_BIG_ENDIAN( vi[0] );
values[1] = ( 1 < SIZE ) ? SWAP_IF_BIG_ENDIAN( vi[1] ) : 0;
values[2] = ( 2 < SIZE ) ? SWAP_IF_BIG_ENDIAN( vi[2] ) : 0;
values[3] = ( 3 < SIZE ) ? SWAP_IF_BIG_ENDIAN( vi[3] ) : 0;
GCM_FUNC( cellGcmInlineTransfer, dstVidOffset, values, 4, 0 );
LContext->needValidate |= RGL_VALIDATE_FRAGMENT_SHARED_CONSTANTS;
// XXX we don't care about 32bit wrapping, do we ?
++LContext->LastFPConstantModification;
}
template<int SIZE> static void setVectorTypeSharedvpIndex( CgRuntimeParameter* __restrict ptr, const void* __restrict v, const int /*index*/ )
{
const float * __restrict f = ( const float * __restrict )v;
const CgParameterResource *parameterResource = rglGetParameterResource( ptr->program, ptr->parameterEntry );
unsigned short resource = parameterResource->resource;
float * __restrict dst = ( float * __restrict )ptr->pushBufferPointer;
for ( long i = 0; i < SIZE; ++ i )
dst[i] = f[i];
rglPlatformSetVertexRegister4fv( resource, dst );
}
template<int SIZE> static void setVectorTypeSharedvpIndexArray( CgRuntimeParameter* __restrict ptr, const void* __restrict v, const int index )
{
const float * __restrict f = ( const float * __restrict )v;
const CgParameterResource *parameterResource = rglGetParameterResource( ptr->program, ptr->parameterEntry );
unsigned short resource = parameterResource->resource + index; //TODO: assumes the array's registers are allocated contiguously
float * __restrict dst = ( float * __restrict )ptr->pushBufferPointer;
for ( long i = 0; i < SIZE; ++ i )
dst[i] = f[i];
rglPlatformSetVertexRegister4fv( resource, dst );
}
// matrix uniforms
// note that Cg-generated matrices use one binary parameter per row,
// and storage within each parameter is row major (so register setting is easier)
// the tmp-array staging below is tentative
#define ROW_MAJOR 0
#define COL_MAJOR 1
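// Worked example (illustrative): for a float3x3 set column major
// (ORDER == COL_MAJOR), source element (row = 1, col = 2) is read from
// f[col * ROWS + row] = f[2 * 3 + 1] = f[7] and lands in the 4-float
// padded register image at dst[row * 4 + col] = dst[6].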
template <int SIZE> static void setVectorTypevpIndex( CgRuntimeParameter* __restrict ptr, const void* __restrict v, const int /*index*/ )
{
RGLcontext * LContext = _CurrentContext;
const float * __restrict f = ( const float* )v;
float * __restrict dst = ( float* )ptr->pushBufferPointer;
for ( long i = 0; i < SIZE; ++ i )
dst[i] = f[i];
LContext->needValidate |= RGL_VALIDATE_VERTEX_CONSTANTS;
}
template <int SIZE> static void setVectorTypevpIndexArray( CgRuntimeParameter* __restrict ptr, const void* __restrict v, const int index )
{
RGLcontext * LContext = _CurrentContext;
const float * __restrict f = ( const float* )v;
float * __restrict dst = ( float* )( *(( unsigned int ** )ptr->pushBufferPointer + index ) );
for ( long i = 0; i < SIZE; ++ i )
dst[i] = f[i];
LContext->needValidate |= RGL_VALIDATE_VERTEX_CONSTANTS;
}
template<int SIZE> static void setVectorTypefpIndex( CgRuntimeParameter* __restrict ptr, const void* __restrict v, const int /*index*/ )
{
float * __restrict f = ( float* )v;
float * __restrict data = ( float* )ptr->pushBufferPointer;
for ( long i = 0; i < SIZE; ++i ) //TODO: ced: determine whether this host copy serves the get path, re-use of the same shader, or just alignment
data[i] = f[i];
_CGprogram *program = ptr->program;
const CgParameterResource *parameterResource = rglGetParameterResource( program, ptr->parameterEntry );
unsigned short resource = parameterResource->resource;
unsigned short *ec = ( unsigned short * )( ptr->program->resources ) + resource + 1;
if ( RGL_LIKELY( *ec ) )
{
swapandsetfp<SIZE>( program->header.instructionCount*16, program->loadProgramId, program->loadProgramOffset, ec, ( unsigned int * )data );
}
}
template<int SIZE> static void setVectorTypefpIndexArray( CgRuntimeParameter* __restrict ptr, const void* __restrict v, const int index )
{
float * __restrict f = ( float* )v;
float * __restrict data = ( float* )ptr->pushBufferPointer;
for ( long i = 0; i < SIZE; ++i ) //TODO: ced: determine whether this host copy serves the get path, re-use of the same shader, or just alignment
data[i] = f[i];
_CGprogram *program = ptr->program;
const CgParameterResource *parameterResource = rglGetParameterResource( program, ptr->parameterEntry );
unsigned short resource = parameterResource->resource;
unsigned short *ec = ( unsigned short * )( program->resources ) + resource + 1;
int arrayIndex = index;
while ( arrayIndex ) //jump to the right index... this is slow
{
ec += (( *ec ) + 2 );//+1 for the register, +1 for the count, +count for the number of embedded consts
arrayIndex--;
}
if ( RGL_LIKELY( *ec ) )
{
swapandsetfp<SIZE>( program->header.instructionCount*16, program->loadProgramId, program->loadProgramOffset, ec, ( unsigned int * )data );
}
}
//matrices
template <int ROWS, int COLS, int ORDER> static void setMatrixvpIndex( CgRuntimeParameter* __restrict ptr, const void* __restrict v, const int index )
{
RGLcontext * LContext = _CurrentContext;
float * __restrict f = ( float* )v;
float * __restrict dst = ( float* )ptr->pushBufferPointer;
for ( long row = 0; row < ROWS; ++row )
{
for ( long col = 0; col < COLS; ++col )
dst[row * 4 + col] = ( ORDER == ROW_MAJOR ) ? f[row * COLS + col] : f[col * ROWS + row];
}
LContext->needValidate |= RGL_VALIDATE_VERTEX_CONSTANTS;
}
template <int ROWS, int COLS, int ORDER> static void setMatrixSharedvpIndex( CgRuntimeParameter* __restrict ptr, const void* __restrict v, const int /*index*/ )
{
float * __restrict f = ( float* )v;
float * __restrict dst = ( float* )ptr->pushBufferPointer;
const CgParameterResource *parameterResource = rglGetParameterResource( ptr->program, ptr->parameterEntry );
unsigned short resource = parameterResource->resource;
float tmp[ROWS*4];
for ( long row = 0; row < ROWS; ++row )
{
for ( long col = 0; col < COLS; ++col )
{
tmp[row*4 + col] = dst[row * 4 + col] = ( ORDER == ROW_MAJOR ) ? f[row * COLS + col] : f[col * ROWS + row];
}
for ( long col = COLS; col < 4; ++col ) tmp[row*4 + col] = dst[row*4+col];
}
GCM_FUNC( cellGcmSetVertexProgramParameterBlock, resource, ROWS, ( const float* )tmp );
}
template <int ROWS, int COLS, int ORDER> static void setMatrixSharedvpIndexArray( CgRuntimeParameter* __restrict ptr, const void* __restrict v, const int index )
{
float * __restrict f = ( float* )v;
float * __restrict dst = ( float* )ptr->pushBufferPointer;
const CgParameterResource *parameterResource = rglGetParameterResource( ptr->program, ptr->parameterEntry );
unsigned short resource = parameterResource->resource + index * ROWS;
float tmp[ROWS*4];
for ( long row = 0; row < ROWS; ++row )
{
for ( long col = 0; col < COLS; ++col )
{
tmp[row*4 + col] = dst[row * 4 + col] = ( ORDER == ROW_MAJOR ) ? f[row * COLS + col] : f[col * ROWS + row];
}
for ( long col = COLS; col < 4; ++col ) tmp[row*4 + col] = dst[row*4+col];
}
GCM_FUNC( cellGcmSetVertexProgramParameterBlock, resource, ROWS, tmp );
}
template <int ROWS, int COLS, int ORDER> static void setMatrixSharedfpIndex( CgRuntimeParameter* __restrict ptr, const void* __restrict v, const int /*index*/ )
{
rglGcmDriver *driver = rglGetGcmDriver();
const CgParameterResource *parameterResource = rglGetParameterResource( ptr->program, ptr->parameterEntry );
unsigned short resource = parameterResource->resource;
unsigned short sharedResource = *(( unsigned short * )( ptr->program->resources ) + resource );
GLuint dstVidOffset = gmmIdToOffset( driver->sharedFPConstantsId ) + sharedResource * 16;
//we assume that the assignment is contiguous
const unsigned int * __restrict u = ( const unsigned int* )v;
unsigned int tmp[ROWS*4];
for ( long row = 0; row < ROWS; ++row )
{
tmp[row*4 + 0] = (( ORDER == ROW_MAJOR ) ? u[row * COLS + 0] : u[0 * ROWS + row] );
tmp[row*4 + 1] = (( 1 < COLS ) ? (( ORDER == ROW_MAJOR ) ? u[row * COLS + 1] : u[1 * ROWS + row] ) : 0 );
tmp[row*4 + 2] = (( 2 < COLS ) ? (( ORDER == ROW_MAJOR ) ? u[row * COLS + 2] : u[2 * ROWS + row] ) : 0 );
tmp[row*4 + 3] = (( 3 < COLS ) ? (( ORDER == ROW_MAJOR ) ? u[row * COLS + 3] : u[3 * ROWS + row] ) : 0 );
}
GCM_FUNC( cellGcmSetTransferLocation, CELL_GCM_LOCATION_LOCAL );
void *pointer=NULL;
GCM_FUNC( cellGcmSetInlineTransferPointer, dstVidOffset, 4*ROWS, &pointer);
float *fp = (float*)pointer;
float *src = (float*)tmp;
for (uint32_t j=0; j<ROWS;j++)
{
fp[0] = cellGcmSwap16Float32(src[0]);
fp[1] = cellGcmSwap16Float32(src[1]);
fp[2] = cellGcmSwap16Float32(src[2]);
fp[3] = cellGcmSwap16Float32(src[3]);
fp+=4;src+=4;
}
RGLcontext * LContext = _CurrentContext;
LContext->needValidate |= RGL_VALIDATE_FRAGMENT_SHARED_CONSTANTS;
++LContext->LastFPConstantModification;
}
template <int ROWS, int COLS, int ORDER> static void setMatrixSharedfpIndexArray( CgRuntimeParameter* __restrict ptr, const void* __restrict v, const int index )
{
//TODO: double check the half-word endian swap... not done here -- is it done by the RSX?
rglGcmDriver *driver = rglGetGcmDriver();
const CgParameterResource *parameterResource = rglGetParameterResource( ptr->program, ptr->parameterEntry );
unsigned short resource = parameterResource->resource;
//slow... skip the indices
unsigned short *sharedResourcePtr = (( unsigned short * )( ptr->program->resources ) + resource );
int arrayIndex = index * ROWS;
while ( arrayIndex ) //jump to the right index... this is slow
{
sharedResourcePtr += (( *sharedResourcePtr ) + 2 );//+1 for the register, +1 for the count, +count for the number of embedded consts
arrayIndex--;
}
unsigned short sharedResource = *sharedResourcePtr;
GLuint dstVidOffset = gmmIdToOffset( driver->sharedFPConstantsId ) + sharedResource * 16;
//we assume that the assignment is contiguous
const unsigned int * __restrict u = ( const unsigned int* )v;
unsigned int tmp[ROWS*4];
for ( long row = 0; row < ROWS; ++row )
{
tmp[row*4 + 0] = (( ORDER == ROW_MAJOR ) ? u[row * COLS + 0] : u[0 * ROWS + row] );
tmp[row*4 + 1] = (( 1 < COLS ) ? (( ORDER == ROW_MAJOR ) ? u[row * COLS + 1] : u[1 * ROWS + row] ) : 0 );
tmp[row*4 + 2] = (( 2 < COLS ) ? (( ORDER == ROW_MAJOR ) ? u[row * COLS + 2] : u[2 * ROWS + row] ) : 0 );
tmp[row*4 + 3] = (( 3 < COLS ) ? (( ORDER == ROW_MAJOR ) ? u[row * COLS + 3] : u[3 * ROWS + row] ) : 0 );
}
GCM_FUNC( cellGcmSetTransferLocation, CELL_GCM_LOCATION_LOCAL );
void *pointer=NULL;
GCM_FUNC( cellGcmSetInlineTransferPointer, dstVidOffset, 4*ROWS, &pointer);
float *fp = (float*)pointer;
const float *src = (const float*)tmp;
for (uint32_t j=0; j<4*ROWS;j++)
{
*fp = cellGcmSwap16Float32(*src);
fp++;src++;
}
RGLcontext * LContext = _CurrentContext;
LContext->needValidate |= RGL_VALIDATE_FRAGMENT_SHARED_CONSTANTS;
++LContext->LastFPConstantModification;
}
//TODO: check -- this works only for consecutive register allocation
template <int ROWS, int COLS, int ORDER> static void setMatrixvpIndexArray( CgRuntimeParameter* __restrict ptr, const void* __restrict v, const int index )
{
RGLcontext * LContext = _CurrentContext;
float * __restrict f = ( float* )v;
float * __restrict dst = ( float* )( *(( unsigned int ** )ptr->pushBufferPointer + index ) );
for ( long row = 0; row < ROWS; ++row )
{
for ( long col = 0; col < COLS; ++col )
dst[row * 4 + col] = ( ORDER == ROW_MAJOR ) ? f[row * COLS + col] : f[col * ROWS + row];
}
LContext->needValidate |= RGL_VALIDATE_VERTEX_CONSTANTS;
}
template <int ROWS, int COLS, int ORDER> static void setMatrixfpIndex( CgRuntimeParameter* __restrict ptr, const void* __restrict v, const int /*index*/ )
{
float * __restrict f = ( float* )v;
float * __restrict dst = ( float* )ptr->pushBufferPointer;
_CGprogram *program = (( CgRuntimeParameter* )ptr )->program;
const CgParameterResource *parameterResource = rglGetParameterResource( program, ptr->parameterEntry );
unsigned short resource = parameterResource->resource;
unsigned short *ec = ( unsigned short * )program->resources + resource + 1; //+1 to skip the register
for ( long row = 0; row < ROWS; ++row )
{
for ( long col = 0; col < COLS; ++col )
dst[row * 4 + col] = ( ORDER == ROW_MAJOR ) ? f[row * COLS + col] : f[col * ROWS + row];
int count = *ec;
if ( RGL_LIKELY( count ) )
{
swapandsetfp<COLS>( program->header.instructionCount*16, program->loadProgramId, program->loadProgramOffset, ec, ( unsigned int * )dst + row * 4 );
}
ec += count + 2; //+1 for the register, +1 for the count, + count for the number of embedded consts
}
}
template <int ROWS, int COLS, int ORDER> static void setMatrixfpIndexArray( CgRuntimeParameter* __restrict ptr, const void* __restrict v, const int index )
{
float * __restrict f = ( float* )v;
float * __restrict dst = ( float* )ptr->pushBufferPointer;
_CGprogram *program = ptr->program;
const CgParameterResource *parameterResource = rglGetParameterResource( program, ptr->parameterEntry );
unsigned short resource = parameterResource->resource;
unsigned short *ec = ( unsigned short * )program->resources + resource + 1;//+1 to skip the register
int arrayIndex = index * ROWS;
while ( arrayIndex ) //jump to the right index... this is slow
{
unsigned short count = ( *ec );
ec += ( count + 2 ); //+1 for the register, +1 for the count, +count for the number of embedded consts
arrayIndex--;
}
for ( long row = 0; row < ROWS; ++row )
{
for ( long col = 0; col < COLS; ++col )
dst[row * 4 + col] = ( ORDER == ROW_MAJOR ) ? f[row * COLS + col] : f[col * ROWS + row];
int count = *ec;
if ( RGL_LIKELY( count ) )
{
swapandsetfp<COLS>( program->header.instructionCount*16, program->loadProgramId, program->loadProgramOffset, ec, ( unsigned int * )dst + row * 4 );
}
ec += count + 2;//+1 for the register, +1 for the count, +count for the number of embedded consts
}
}
static _cgSetArrayIndexFunction setVectorTypeIndex[2][2][2][4] =
{
{
{
{&setVectorTypevpIndex<1>, &setVectorTypevpIndex<2>, &setVectorTypevpIndex<3>, &setVectorTypevpIndex<4>, },
{&setVectorTypefpIndex<1>, &setVectorTypefpIndex<2>, &setVectorTypefpIndex<3>, &setVectorTypefpIndex<4>, }
},
{
{&setVectorTypeSharedvpIndex<1>, &setVectorTypeSharedvpIndex<2>, &setVectorTypeSharedvpIndex<3>, &setVectorTypeSharedvpIndex<4>, }, //should be the shared
{&setVectorTypeSharedfpIndex<1>, &setVectorTypeSharedfpIndex<2>, &setVectorTypeSharedfpIndex<3>, &setVectorTypeSharedfpIndex<4>, } //should be the shared
},
},
{
{
{&setVectorTypevpIndexArray<1>, &setVectorTypevpIndexArray<2>, &setVectorTypevpIndexArray<3>, &setVectorTypevpIndexArray<4>, },
{&setVectorTypefpIndexArray<1>, &setVectorTypefpIndexArray<2>, &setVectorTypefpIndexArray<3>, &setVectorTypefpIndexArray<4>, }
},
{
{&setVectorTypeSharedvpIndexArray<1>, &setVectorTypeSharedvpIndexArray<2>, &setVectorTypeSharedvpIndexArray<3>, &setVectorTypeSharedvpIndexArray<4>, }, //should be the shared
{&setVectorTypeSharedfpIndexArray<1>, &setVectorTypeSharedfpIndexArray<2>, &setVectorTypeSharedfpIndexArray<3>, &setVectorTypeSharedfpIndexArray<4>, } //should be the shared
},
},
};
static _cgSetArrayIndexFunction setMatrixTypeIndex[2][2][2][4][4][2] =
{
{
{
{
{{ &setMatrixvpIndex<1, 1, 0>, &setMatrixvpIndex<1, 1, 1>}, { &setMatrixvpIndex<1, 2, 0>, &setMatrixvpIndex<1, 2, 1>}, { &setMatrixvpIndex<1, 3, 0>, &setMatrixvpIndex<1, 3, 1>}, { &setMatrixvpIndex<1, 4, 0>, &setMatrixvpIndex<1, 4, 1>}},
{{ &setMatrixvpIndex<2, 1, 0>, &setMatrixvpIndex<2, 1, 1>}, { &setMatrixvpIndex<2, 2, 0>, &setMatrixvpIndex<2, 2, 1>}, { &setMatrixvpIndex<2, 3, 0>, &setMatrixvpIndex<2, 3, 1>}, { &setMatrixvpIndex<2, 4, 0>, &setMatrixvpIndex<2, 4, 1>}},
{{ &setMatrixvpIndex<3, 1, 0>, &setMatrixvpIndex<3, 1, 1>}, { &setMatrixvpIndex<3, 2, 0>, &setMatrixvpIndex<3, 2, 1>}, { &setMatrixvpIndex<3, 3, 0>, &setMatrixvpIndex<3, 3, 1>}, { &setMatrixvpIndex<3, 4, 0>, &setMatrixvpIndex<3, 4, 1>}},
{{ &setMatrixvpIndex<4, 1, 0>, &setMatrixvpIndex<4, 1, 1>}, { &setMatrixvpIndex<4, 2, 0>, &setMatrixvpIndex<4, 2, 1>}, { &setMatrixvpIndex<4, 3, 0>, &setMatrixvpIndex<4, 3, 1>}, { &setMatrixvpIndex<4, 4, 0>, &setMatrixvpIndex<4, 4, 1>}},
},
{
{{ &setMatrixfpIndex<1, 1, 0>, &setMatrixfpIndex<1, 1, 1>}, { &setMatrixfpIndex<1, 2, 0>, &setMatrixfpIndex<1, 2, 1>}, { &setMatrixfpIndex<1, 3, 0>, &setMatrixfpIndex<1, 3, 1>}, { &setMatrixfpIndex<1, 4, 0>, &setMatrixfpIndex<1, 4, 1>}},
{{ &setMatrixfpIndex<2, 1, 0>, &setMatrixfpIndex<2, 1, 1>}, { &setMatrixfpIndex<2, 2, 0>, &setMatrixfpIndex<2, 2, 1>}, { &setMatrixfpIndex<2, 3, 0>, &setMatrixfpIndex<2, 3, 1>}, { &setMatrixfpIndex<2, 4, 0>, &setMatrixfpIndex<2, 4, 1>}},
{{ &setMatrixfpIndex<3, 1, 0>, &setMatrixfpIndex<3, 1, 1>}, { &setMatrixfpIndex<3, 2, 0>, &setMatrixfpIndex<3, 2, 1>}, { &setMatrixfpIndex<3, 3, 0>, &setMatrixfpIndex<3, 3, 1>}, { &setMatrixfpIndex<3, 4, 0>, &setMatrixfpIndex<3, 4, 1>}},
{{ &setMatrixfpIndex<4, 1, 0>, &setMatrixfpIndex<4, 1, 1>}, { &setMatrixfpIndex<4, 2, 0>, &setMatrixfpIndex<4, 2, 1>}, { &setMatrixfpIndex<4, 3, 0>, &setMatrixfpIndex<4, 3, 1>}, { &setMatrixfpIndex<4, 4, 0>, &setMatrixfpIndex<4, 4, 1>}},
},
},
{ //should be shared
{
{{ &setMatrixSharedvpIndex<1, 1, 0>, &setMatrixSharedvpIndex<1, 1, 1>}, { &setMatrixSharedvpIndex<1, 2, 0>, &setMatrixSharedvpIndex<1, 2, 1>}, { &setMatrixSharedvpIndex<1, 3, 0>, &setMatrixSharedvpIndex<1, 3, 1>}, { &setMatrixSharedvpIndex<1, 4, 0>, &setMatrixSharedvpIndex<1, 4, 1>}},
{{ &setMatrixSharedvpIndex<2, 1, 0>, &setMatrixSharedvpIndex<2, 1, 1>}, { &setMatrixSharedvpIndex<2, 2, 0>, &setMatrixSharedvpIndex<2, 2, 1>}, { &setMatrixSharedvpIndex<2, 3, 0>, &setMatrixSharedvpIndex<2, 3, 1>}, { &setMatrixSharedvpIndex<2, 4, 0>, &setMatrixSharedvpIndex<2, 4, 1>}},
{{ &setMatrixSharedvpIndex<3, 1, 0>, &setMatrixSharedvpIndex<3, 1, 1>}, { &setMatrixSharedvpIndex<3, 2, 0>, &setMatrixSharedvpIndex<3, 2, 1>}, { &setMatrixSharedvpIndex<3, 3, 0>, &setMatrixSharedvpIndex<3, 3, 1>}, { &setMatrixSharedvpIndex<3, 4, 0>, &setMatrixSharedvpIndex<3, 4, 1>}},
{{ &setMatrixSharedvpIndex<4, 1, 0>, &setMatrixSharedvpIndex<4, 1, 1>}, { &setMatrixSharedvpIndex<4, 2, 0>, &setMatrixSharedvpIndex<4, 2, 1>}, { &setMatrixSharedvpIndex<4, 3, 0>, &setMatrixSharedvpIndex<4, 3, 1>}, { &setMatrixSharedvpIndex<4, 4, 0>, &setMatrixSharedvpIndex<4, 4, 1>}},
},
{
{{ &setMatrixSharedfpIndex<1, 1, 0>, &setMatrixSharedfpIndex<1, 1, 1>}, { &setMatrixSharedfpIndex<1, 2, 0>, &setMatrixSharedfpIndex<1, 2, 1>}, { &setMatrixSharedfpIndex<1, 3, 0>, &setMatrixSharedfpIndex<1, 3, 1>}, { &setMatrixSharedfpIndex<1, 4, 0>, &setMatrixSharedfpIndex<1, 4, 1>}},
{{ &setMatrixSharedfpIndex<2, 1, 0>, &setMatrixSharedfpIndex<2, 1, 1>}, { &setMatrixSharedfpIndex<2, 2, 0>, &setMatrixSharedfpIndex<2, 2, 1>}, { &setMatrixSharedfpIndex<2, 3, 0>, &setMatrixSharedfpIndex<2, 3, 1>}, { &setMatrixSharedfpIndex<2, 4, 0>, &setMatrixSharedfpIndex<2, 4, 1>}},
{{ &setMatrixSharedfpIndex<3, 1, 0>, &setMatrixSharedfpIndex<3, 1, 1>}, { &setMatrixSharedfpIndex<3, 2, 0>, &setMatrixSharedfpIndex<3, 2, 1>}, { &setMatrixSharedfpIndex<3, 3, 0>, &setMatrixSharedfpIndex<3, 3, 1>}, { &setMatrixSharedfpIndex<3, 4, 0>, &setMatrixSharedfpIndex<3, 4, 1>}},
{{ &setMatrixSharedfpIndex<4, 1, 0>, &setMatrixSharedfpIndex<4, 1, 1>}, { &setMatrixSharedfpIndex<4, 2, 0>, &setMatrixSharedfpIndex<4, 2, 1>}, { &setMatrixSharedfpIndex<4, 3, 0>, &setMatrixSharedfpIndex<4, 3, 1>}, { &setMatrixSharedfpIndex<4, 4, 0>, &setMatrixSharedfpIndex<4, 4, 1>}},
},
},
},
{
{
{
{{ &setMatrixvpIndexArray<1, 1, 0>, &setMatrixvpIndexArray<1, 1, 1>}, { &setMatrixvpIndexArray<1, 2, 0>, &setMatrixvpIndexArray<1, 2, 1>}, { &setMatrixvpIndexArray<1, 3, 0>, &setMatrixvpIndexArray<1, 3, 1>}, { &setMatrixvpIndexArray<1, 4, 0>, &setMatrixvpIndexArray<1, 4, 1>}},
{{ &setMatrixvpIndexArray<2, 1, 0>, &setMatrixvpIndexArray<2, 1, 1>}, { &setMatrixvpIndexArray<2, 2, 0>, &setMatrixvpIndexArray<2, 2, 1>}, { &setMatrixvpIndexArray<2, 3, 0>, &setMatrixvpIndexArray<2, 3, 1>}, { &setMatrixvpIndexArray<2, 4, 0>, &setMatrixvpIndexArray<2, 4, 1>}},
{{ &setMatrixvpIndexArray<3, 1, 0>, &setMatrixvpIndexArray<3, 1, 1>}, { &setMatrixvpIndexArray<3, 2, 0>, &setMatrixvpIndexArray<3, 2, 1>}, { &setMatrixvpIndexArray<3, 3, 0>, &setMatrixvpIndexArray<3, 3, 1>}, { &setMatrixvpIndexArray<3, 4, 0>, &setMatrixvpIndexArray<3, 4, 1>}},
{{ &setMatrixvpIndexArray<4, 1, 0>, &setMatrixvpIndexArray<4, 1, 1>}, { &setMatrixvpIndexArray<4, 2, 0>, &setMatrixvpIndexArray<4, 2, 1>}, { &setMatrixvpIndexArray<4, 3, 0>, &setMatrixvpIndexArray<4, 3, 1>}, { &setMatrixvpIndexArray<4, 4, 0>, &setMatrixvpIndexArray<4, 4, 1>}},
},
{
{{ &setMatrixfpIndexArray<1, 1, 0>, &setMatrixfpIndexArray<1, 1, 1>}, { &setMatrixfpIndexArray<1, 2, 0>, &setMatrixfpIndexArray<1, 2, 1>}, { &setMatrixfpIndexArray<1, 3, 0>, &setMatrixfpIndexArray<1, 3, 1>}, { &setMatrixfpIndexArray<1, 4, 0>, &setMatrixfpIndexArray<1, 4, 1>}},
{{ &setMatrixfpIndexArray<2, 1, 0>, &setMatrixfpIndexArray<2, 1, 1>}, { &setMatrixfpIndexArray<2, 2, 0>, &setMatrixfpIndexArray<2, 2, 1>}, { &setMatrixfpIndexArray<2, 3, 0>, &setMatrixfpIndexArray<2, 3, 1>}, { &setMatrixfpIndexArray<2, 4, 0>, &setMatrixfpIndexArray<2, 4, 1>}},
{{ &setMatrixfpIndexArray<3, 1, 0>, &setMatrixfpIndexArray<3, 1, 1>}, { &setMatrixfpIndexArray<3, 2, 0>, &setMatrixfpIndexArray<3, 2, 1>}, { &setMatrixfpIndexArray<3, 3, 0>, &setMatrixfpIndexArray<3, 3, 1>}, { &setMatrixfpIndexArray<3, 4, 0>, &setMatrixfpIndexArray<3, 4, 1>}},
{{ &setMatrixfpIndexArray<4, 1, 0>, &setMatrixfpIndexArray<4, 1, 1>}, { &setMatrixfpIndexArray<4, 2, 0>, &setMatrixfpIndexArray<4, 2, 1>}, { &setMatrixfpIndexArray<4, 3, 0>, &setMatrixfpIndexArray<4, 3, 1>}, { &setMatrixfpIndexArray<4, 4, 0>, &setMatrixfpIndexArray<4, 4, 1>}},
},
},
{ //should be shared
{
{{ &setMatrixSharedvpIndexArray<1, 1, 0>, &setMatrixSharedvpIndexArray<1, 1, 1>}, { &setMatrixSharedvpIndexArray<1, 2, 0>, &setMatrixSharedvpIndexArray<1, 2, 1>}, { &setMatrixSharedvpIndexArray<1, 3, 0>, &setMatrixSharedvpIndexArray<1, 3, 1>}, { &setMatrixSharedvpIndexArray<1, 4, 0>, &setMatrixSharedvpIndexArray<1, 4, 1>}},
{{ &setMatrixSharedvpIndexArray<2, 1, 0>, &setMatrixSharedvpIndexArray<2, 1, 1>}, { &setMatrixSharedvpIndexArray<2, 2, 0>, &setMatrixSharedvpIndexArray<2, 2, 1>}, { &setMatrixSharedvpIndexArray<2, 3, 0>, &setMatrixSharedvpIndexArray<2, 3, 1>}, { &setMatrixSharedvpIndexArray<2, 4, 0>, &setMatrixSharedvpIndexArray<2, 4, 1>}},
{{ &setMatrixSharedvpIndexArray<3, 1, 0>, &setMatrixSharedvpIndexArray<3, 1, 1>}, { &setMatrixSharedvpIndexArray<3, 2, 0>, &setMatrixSharedvpIndexArray<3, 2, 1>}, { &setMatrixSharedvpIndexArray<3, 3, 0>, &setMatrixSharedvpIndexArray<3, 3, 1>}, { &setMatrixSharedvpIndexArray<3, 4, 0>, &setMatrixSharedvpIndexArray<3, 4, 1>}},
{{ &setMatrixSharedvpIndexArray<4, 1, 0>, &setMatrixSharedvpIndexArray<4, 1, 1>}, { &setMatrixSharedvpIndexArray<4, 2, 0>, &setMatrixSharedvpIndexArray<4, 2, 1>}, { &setMatrixSharedvpIndexArray<4, 3, 0>, &setMatrixSharedvpIndexArray<4, 3, 1>}, { &setMatrixSharedvpIndexArray<4, 4, 0>, &setMatrixSharedvpIndexArray<4, 4, 1>}},
},
{
{{ &setMatrixSharedfpIndexArray<1, 1, 0>, &setMatrixSharedfpIndexArray<1, 1, 1>}, { &setMatrixSharedfpIndexArray<1, 2, 0>, &setMatrixSharedfpIndexArray<1, 2, 1>}, { &setMatrixSharedfpIndexArray<1, 3, 0>, &setMatrixSharedfpIndexArray<1, 3, 1>}, { &setMatrixSharedfpIndexArray<1, 4, 0>, &setMatrixSharedfpIndexArray<1, 4, 1>}},
{{ &setMatrixSharedfpIndexArray<2, 1, 0>, &setMatrixSharedfpIndexArray<2, 1, 1>}, { &setMatrixSharedfpIndexArray<2, 2, 0>, &setMatrixSharedfpIndexArray<2, 2, 1>}, { &setMatrixSharedfpIndexArray<2, 3, 0>, &setMatrixSharedfpIndexArray<2, 3, 1>}, { &setMatrixSharedfpIndexArray<2, 4, 0>, &setMatrixSharedfpIndexArray<2, 4, 1>}},
{{ &setMatrixSharedfpIndexArray<3, 1, 0>, &setMatrixSharedfpIndexArray<3, 1, 1>}, { &setMatrixSharedfpIndexArray<3, 2, 0>, &setMatrixSharedfpIndexArray<3, 2, 1>}, { &setMatrixSharedfpIndexArray<3, 3, 0>, &setMatrixSharedfpIndexArray<3, 3, 1>}, { &setMatrixSharedfpIndexArray<3, 4, 0>, &setMatrixSharedfpIndexArray<3, 4, 1>}},
{{ &setMatrixSharedfpIndexArray<4, 1, 0>, &setMatrixSharedfpIndexArray<4, 1, 1>}, { &setMatrixSharedfpIndexArray<4, 2, 0>, &setMatrixSharedfpIndexArray<4, 2, 1>}, { &setMatrixSharedfpIndexArray<4, 3, 0>, &setMatrixSharedfpIndexArray<4, 3, 1>}, { &setMatrixSharedfpIndexArray<4, 4, 0>, &setMatrixSharedfpIndexArray<4, 4, 1>}},
},
},
}
};
_cgSetArrayIndexFunction getVectorTypeIndexSetterFunction( unsigned short a, unsigned short b, unsigned short c, unsigned short d )
{
return setVectorTypeIndex[a][b][c][d];
}
_cgSetArrayIndexFunction getMatrixTypeIndexSetterFunction( unsigned short a, unsigned short b, unsigned short c, unsigned short d, unsigned short e, unsigned short f )
{
return setMatrixTypeIndex[a][b][c][d][e][f];
}
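// Usage sketch (mirrors the lookups in rglCreatePushBuffer below):
//
//   _cgSetArrayIndexFunction fn = getVectorTypeIndexSetterFunction(
//      ( flags & CGP_ARRAY )   ? 1 : 0,  // element of an array?
//      ( flags & CGPF_SHARED ) ? 1 : 0,  // shared constant?
//      profileIndex,                     // VERTEX_ or FRAGMENT_PROFILE_INDEX
//      floatCount - 1 );                 // 1..4 components -> index 0..3
//
// The matrix table adds rows-1, cols-1 and ROW_MAJOR/COL_MAJOR indices.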
static void setSamplerfp( CgRuntimeParameter*ptr, const void*v, int ) //index
{
_CGprogram *program = (( CgRuntimeParameter* )ptr )->program;
const CgParameterResource *parameterResource = rglGetParameterResource( program, (( CgRuntimeParameter* )ptr )->parameterEntry );
// v is NULL when this is called from cgGLEnableTextureParameter
// and non-NULL when it is called from cgGLSetTextureParameter
// this may be called by a connected param to propagate its value
// the spec says that the set should not cause the bind
// so only do the bind when the call comes from cgGLEnableTextureParameter
if ( v )
{
*( GLuint* )ptr->pushBufferPointer = *( GLuint* )v;
}
else
{
rglTextureImageUnit *unit = _CurrentContext->TextureImageUnits + ( parameterResource->resource - CG_TEXUNIT0 );
rglBindTextureInternal( unit, *( GLuint* )ptr->pushBufferPointer, ptr->glType );
}
}
static void setSamplervp( CgRuntimeParameter*ptr, const void*v, int ) //index
{
// v is NULL when this is called from cgGLEnableTextureParameter
// and non-NULL when it is called from cgGLSetTextureParameter
// this may be called by a connected param to propagate its value
// the spec says that the set should not cause the bind
// so only do the bind when the call comes from cgGLEnableTextureParameter
if ( v )
{
*( GLuint* )ptr->pushBufferPointer = *( GLuint* )v;
}
}
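// Call pattern implied by the comments above (illustrative only): the Cg
// runtime passes a texture name through v on a set, and NULL on an enable:
//
//   cgGLSetTextureParameter    -> samplerSetter( ptr, &texName, 0 ); // store only
//   cgGLEnableTextureParameter -> samplerSetter( ptr, NULL, 0 );     // bind stored name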
#undef ROW_MAJOR
#undef COL_MAJOR
// Previously from Shader.cpp
//---------------------------------------------------------------------------------------------------------
#define ROW_MAJOR 0
#define COL_MAJOR 1
//This function creates the constant push buffer and related structures in two passes:
//the first computes the space needed, the second fills the buffer and wires up the parameter setters
void rglCreatePushBuffer( _CGprogram *program )
{
//first pass to compute the space needed
int bufferSize = 0;
int programPushBufferPointersSize = 0;
int extraStorageInWords = 0;
int offsetCount = 0;
int samplerCount = 0;
int profileIndex = ( program->header.profile == CG_PROFILE_SCE_FP_TYPEB || //program->header.profile==CG_PROFILE_SCE_FP_TYPEC ||
program->header.profile == CG_PROFILE_SCE_FP_RSX ) ? FRAGMENT_PROFILE_INDEX : VERTEX_PROFILE_INDEX;
bool hasSharedParams = false;
int arrayCount = 1;
for ( int i = 0;i < program->rtParametersCount;i++ )
{
const CgParameterEntry *parameterEntry = program->parametersEntries + i;
//skip the unrolled arrays and the structures
if (( parameterEntry->flags & CGP_STRUCTURE ) || ( parameterEntry->flags & CGP_UNROLLED ) )
{
arrayCount = 1;
continue;
}
if (( parameterEntry->flags & CGPF_REFERENCED ) )
{
if ( parameterEntry->flags & CGP_ARRAY )
{
const CgParameterArray *parameterArray = rglGetParameterArray( program, parameterEntry );
arrayCount = rglGetSizeofSubArray( parameterArray->dimensions, parameterArray->dimensionCount );
continue;
}
if (( parameterEntry->flags & CGPV_MASK ) == CGPV_UNIFORM )
{
const CgParameterResource *parameterResource = rglGetParameterResource( program, parameterEntry );
if ( parameterResource->type >= CG_SAMPLER1D && parameterResource->type <= CG_SAMPLERCUBE )
{
// store 1 sampler and 1 offset for texture samplers.
offsetCount += arrayCount;
samplerCount += arrayCount;
}
else if ( profileIndex == VERTEX_PROFILE_INDEX )
{
if ( parameterResource->type == CGP_SCF_BOOL )
{
//do nothing
}
else if ( !( parameterEntry->flags & CGPF_SHARED ) )
{
int registerStride = isMatrix(( CGtype )parameterResource->type ) ? rglGetTypeRowCount(( CGtype )parameterResource->type ) : 1;
if ( parameterEntry->flags & CGP_CONTIGUOUS )
bufferSize += 3 + 4 * arrayCount * registerStride;
else
{
programPushBufferPointersSize += arrayCount;
int resourceIndex = parameterResource->resource;
int referencedSize = 3 + 4 * registerStride;
int notReferencedSize = 4 * registerStride;
for ( int j = 0;j < arrayCount;j++, resourceIndex += registerStride )
{
//the programPushBuffer pointer is used so an array element can exist without any assignment
if ( program->resources[resourceIndex] != 0xffff )
bufferSize += referencedSize; //referenced: push buffer
else
extraStorageInWords += notReferencedSize; //not referenced , extra storage location
}
}
}
else
{
hasSharedParams = true;
if ( !( parameterEntry->flags & CGP_CONTIGUOUS ) )
{
programPushBufferPointersSize += arrayCount;
}
}
}
else //profileIndex == FRAGMENT_PROFILE_INDEX
{
int registerStride = isMatrix(( CGtype )parameterResource->type ) ? rglGetTypeRowCount(( CGtype )parameterResource->type ) : 1;
if ( !( parameterEntry->flags & CGPF_SHARED ) )
{
//TODO: check this case
extraStorageInWords += 4 * arrayCount * registerStride;
}
else
{
hasSharedParams = true;
unsigned short *resource = program->resources + parameterResource->resource;
for ( int j = 0;j < arrayCount*registerStride;j++ )
{
resource++;
unsigned short count = *resource++;
bufferSize += 24 * count;
resource += count;
}
}
}
}
}
arrayCount = 1;
}
if (( profileIndex == FRAGMENT_PROFILE_INDEX ) && ( hasSharedParams ) )
{
bufferSize += 8 + 3 + 2; // GCM_PORT_TESTED [CEDRIC] +3 for the channel switch that gcm does + 2 for the OUT end
}
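// Sizing example (illustrative): a referenced, non-shared, contiguous
// float4x4 vertex uniform has registerStride = 4, so the first pass reserves
// 3 + 4 * 1 * 4 = 19 words: a 3-word cellGcmSetVertexProgramParameterBlock
// command header plus 16 floats of payload (matching the memset of
// 4*(4*registerCount + 3) bytes in the second pass).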
bufferSize = rglPad( bufferSize, 4 );
//allocate the buffer(s)
unsigned int storageSizeInWords = bufferSize + extraStorageInWords;
if ( storageSizeInWords )
program->memoryBlock = ( unsigned int* )memalign( 16, storageSizeInWords * 4 );
else
program->memoryBlock = NULL;
//TODO: this is tmp
program->samplerCount = samplerCount;
if ( samplerCount )
{
program->samplerValuesLocation = ( GLuint* )malloc( samplerCount * sizeof( GLuint ) );
program->samplerIndices = ( GLuint* )malloc( samplerCount * sizeof( GLuint ) );
program->samplerUnits = ( GLuint* )malloc( samplerCount * sizeof( GLuint ) );
}
else
{
program->samplerValuesLocation = NULL;
program->samplerIndices = NULL;
program->samplerUnits = NULL;
}
GLuint *samplerValuesLocation = program->samplerValuesLocation;
GLuint *samplerIndices = program->samplerIndices;
GLuint *samplerUnits = program->samplerUnits;
if ( programPushBufferPointersSize )
program->constantPushBufferPointers = ( unsigned int** )malloc( programPushBufferPointersSize * 4 );
else
program->constantPushBufferPointers = NULL;
uint32_t *rglGcmCurrent = (uint32_t*)program->memoryBlock;
program->constantPushBuffer = ( bufferSize > 0 ) ? ( unsigned int * )rglGcmCurrent : NULL;
unsigned int **programPushBuffer = program->constantPushBufferPointers;
program->constantPushBufferWordSize = bufferSize;
GLuint *currentStorage = ( GLuint * )( rglGcmCurrent + bufferSize );
int outOfMemory = 0;
//second pass to fill the buffer
arrayCount = 1;
const CgParameterEntry *containerEntry = NULL;
for ( int i = 0;i < program->rtParametersCount;i++ )
{
CgRuntimeParameter *rtParameter = program->runtimeParameters + i;
const CgParameterEntry *parameterEntry = program->parametersEntries + i;
if ( containerEntry == NULL )
containerEntry = parameterEntry;
//rtParameter->setter = _cgRaiseInvalidParam;
//rtParameter->setterr = _cgRaiseNotMatrixParam;
//rtParameter->setterc = _cgRaiseNotMatrixParam;
rtParameter->samplerSetter = _cgRaiseInvalidParamIndex;
//tentative
rtParameter->setterIndex = _cgRaiseInvalidParamIndex;
rtParameter->setterrIndex = _cgRaiseNotMatrixParamIndex;
rtParameter->settercIndex = _cgRaiseNotMatrixParamIndex;
CGparameter id = ( CGparameter )rglCreateName( &_CurrentContext->cgParameterNameSpace, ( void* )rtParameter );
if ( !id )
{
outOfMemory = 1;
break;
}
rtParameter->id = id;
rtParameter->parameterEntry = parameterEntry;
rtParameter->program = program;
//skip the unrolled arrays and the structures
if (( parameterEntry->flags & CGP_STRUCTURE ) || ( parameterEntry->flags & CGP_UNROLLED ) )
{
arrayCount = 1;
containerEntry = NULL;
continue;
}
if ( parameterEntry->flags & CGPF_REFERENCED )
{
if ( parameterEntry->flags & CGP_ARRAY )
{
const CgParameterArray *parameterArray = rglGetParameterArray( program, parameterEntry );
arrayCount = rglGetSizeofSubArray( parameterArray->dimensions, parameterArray->dimensionCount );
//continue to the next item
continue;
}
if (( parameterEntry->flags & CGPV_MASK ) == CGPV_UNIFORM )
{
//TODO: rtParameter->defaultNormalize = CG_FALSE;
rtParameter->glType = GL_NONE;
//TODO: needed ? rtParameter->flags = 0;
const CgParameterResource *parameterResource = rglGetParameterResource( program, parameterEntry );
if ( parameterResource->type >= CG_SAMPLER1D && parameterResource->type <= CG_SAMPLERCUBE )
{
//TODO
rtParameter->pushBufferPointer = samplerValuesLocation;
// initialize the texture name to zero, used by the setSamplerfp call to rglBindTextureInternal
*samplerValuesLocation = 0;
samplerValuesLocation++;
// store the texture unit indices.
*samplerIndices = i;
samplerIndices++;
*samplerUnits = parameterResource->resource - CG_TEXUNIT0;
samplerUnits++;
// XXX the setter is called when validating vertex programs.
// this would cause a CG error.
// the parameters should have a "validate" function instead
if ( profileIndex == VERTEX_PROFILE_INDEX )
{
rtParameter->setterIndex = _cgIgnoreSetParamIndex;
rtParameter->samplerSetter = setSamplervp;
}
else
{
rtParameter->samplerSetter = setSamplerfp;
}
rtParameter->glType = rglCgGetSamplerGLTypeFromCgType(( CGtype )( parameterResource->type ) );
}
else
{
if ( profileIndex == VERTEX_PROFILE_INDEX )
{
if ( parameterResource->type == CGP_SCF_BOOL )
{
//do nothing
}
else if ( !( parameterEntry->flags & CGPF_SHARED ) )
{
int registerStride = isMatrix(( CGtype )parameterResource->type ) ? rglGetTypeRowCount(( CGtype )parameterResource->type ) : 1;
int registerCount = arrayCount * registerStride;
if ( parameterEntry->flags & CGP_CONTIGUOUS )
{
memset( rglGcmCurrent, 0, 4*( 4*registerCount + 3 ) );
GCM_FUNC_BUFFERED( cellGcmSetVertexProgramParameterBlock, rglGcmCurrent, parameterResource->resource, registerCount, ( float* )rglGcmCurrent );
rtParameter->pushBufferPointer = rglGcmCurrent - 4 * registerCount;
}
else
{
rtParameter->pushBufferPointer = programPushBuffer;
int resourceIndex = parameterResource->resource;
for ( int j = 0;j < arrayCount;j++, resourceIndex += registerStride )
{
//the programPushBuffer pointer is used so an array element can exist without any assignment
if ( program->resources[resourceIndex] != 0xffff )
{
memset( rglGcmCurrent, 0, 4*( 4*registerStride + 3 ) );
GCM_FUNC_BUFFERED( cellGcmSetVertexProgramParameterBlock, rglGcmCurrent, program->resources[resourceIndex], registerStride, ( float* )rglGcmCurrent ); // GCM_PORT_TESTED [KHOFF]
*( programPushBuffer++ ) = ( unsigned int* )( rglGcmCurrent - 4 * registerStride );
}
else
{
//This case is when there is an array item which is not referenced
//we still call the setter function, so we have to store the info somewhere...
//and we need to return the value previously set in case the user asks for a get
*( programPushBuffer++ ) = ( unsigned int* )currentStorage;
currentStorage += 4 * registerStride;
}
}
}
}
else
{
rglGcmDriver *driver = rglGetGcmDriver();
if ( parameterEntry->flags & CGP_CONTIGUOUS )
{
rtParameter->pushBufferPointer = driver->sharedVPConstants + parameterResource->resource * 4 * sizeof( float );
}
else
{
int registerStride = isMatrix(( CGtype )parameterResource->type ) ? rglGetTypeRowCount(( CGtype )parameterResource->type ) : 1;
int registerCount = arrayCount * registerStride;
for ( int j = 0;j < registerCount;j += registerStride )
{
*programPushBuffer = ( unsigned int* )driver->sharedVPConstants + program->resources[parameterResource->resource+j] * 4 * sizeof( float );
rtParameter->pushBufferPointer = programPushBuffer++;
}
}
}
}
else //if (profileIndex == FRAGMENT_PROFILE_INDEX)
{
if ( parameterEntry->flags & CGPF_SHARED )
{
// XXX needs an offset for the get
rtParameter->pushBufferPointer = NULL;
}
else
{
int registerStride = isMatrix(( CGtype )parameterResource->type ) ? rglGetTypeRowCount(( CGtype )parameterResource->type ) : 1;
int registerCount = arrayCount * registerStride;
rtParameter->pushBufferPointer = currentStorage;
currentStorage += 4 * registerCount;
}
}
switch ( parameterResource->type )
{
case CG_FLOAT:
case CG_FLOAT1: case CG_FLOAT2: case CG_FLOAT3: case CG_FLOAT4:
// if this gets updated, don't forget the halves below
{
unsigned int floatCount = rglCountFloatsInCgType(( CGtype )parameterResource->type );
rtParameter->setterIndex = setVectorTypeIndex[( containerEntry->flags&CGP_ARRAY ) ? 1 : 0][( containerEntry->flags&CGPF_SHARED ) ? 1 : 0][profileIndex][floatCount - 1];
}
break;
case CG_FLOAT1x1: case CG_FLOAT1x2: case CG_FLOAT1x3: case CG_FLOAT1x4:
case CG_FLOAT2x1: case CG_FLOAT2x2: case CG_FLOAT2x3: case CG_FLOAT2x4:
case CG_FLOAT3x1: case CG_FLOAT3x2: case CG_FLOAT3x3: case CG_FLOAT3x4:
case CG_FLOAT4x1: case CG_FLOAT4x2: case CG_FLOAT4x3: case CG_FLOAT4x4:
// if this gets updated, don't forget the halves below
rtParameter->setterrIndex = setMatrixTypeIndex[( containerEntry->flags&CGP_ARRAY ) ? 1 : 0][( containerEntry->flags&CGPF_SHARED ) ? 1 : 0][profileIndex][rglGetTypeRowCount(( CGtype )parameterResource->type ) - 1][rglGetTypeColCount(( CGtype )parameterResource->type ) - 1][ROW_MAJOR];
rtParameter->settercIndex = setMatrixTypeIndex[( containerEntry->flags&CGP_ARRAY ) ? 1 : 0][( containerEntry->flags&CGPF_SHARED ) ? 1 : 0][profileIndex][rglGetTypeRowCount(( CGtype )parameterResource->type ) - 1][rglGetTypeColCount(( CGtype )parameterResource->type ) - 1][COL_MAJOR];
break;
case CG_SAMPLER1D: case CG_SAMPLER2D: case CG_SAMPLER3D: case CG_SAMPLERRECT: case CG_SAMPLERCUBE:
// A used sampler that does not have a TEXUNIT resource ?
// not sure if we ever go here.
break;
case CGP_SCF_BOOL:
break;
case CG_HALF:
case CG_HALF1: case CG_HALF2: case CG_HALF3: case CG_HALF4:
case CG_INT:
case CG_INT1: case CG_INT2: case CG_INT3: case CG_INT4:
case CG_BOOL:
case CG_BOOL1: case CG_BOOL2: case CG_BOOL3: case CG_BOOL4:
case CG_FIXED:
case CG_FIXED1: case CG_FIXED2: case CG_FIXED3: case CG_FIXED4:
{
unsigned int floatCount = rglCountFloatsInCgType(( CGtype )parameterResource->type );
rtParameter->setterIndex = setVectorTypeIndex[( containerEntry->flags&CGP_ARRAY ) ? 1 : 0][( containerEntry->flags&CGPF_SHARED ) ? 1 : 0][profileIndex][floatCount - 1];
}
break;
case CG_HALF1x1: case CG_HALF1x2: case CG_HALF1x3: case CG_HALF1x4:
case CG_HALF2x1: case CG_HALF2x2: case CG_HALF2x3: case CG_HALF2x4:
case CG_HALF3x1: case CG_HALF3x2: case CG_HALF3x3: case CG_HALF3x4:
case CG_HALF4x1: case CG_HALF4x2: case CG_HALF4x3: case CG_HALF4x4:
case CG_INT1x1: case CG_INT1x2: case CG_INT1x3: case CG_INT1x4:
case CG_INT2x1: case CG_INT2x2: case CG_INT2x3: case CG_INT2x4:
case CG_INT3x1: case CG_INT3x2: case CG_INT3x3: case CG_INT3x4:
case CG_INT4x1: case CG_INT4x2: case CG_INT4x3: case CG_INT4x4:
case CG_BOOL1x1: case CG_BOOL1x2: case CG_BOOL1x3: case CG_BOOL1x4:
case CG_BOOL2x1: case CG_BOOL2x2: case CG_BOOL2x3: case CG_BOOL2x4:
case CG_BOOL3x1: case CG_BOOL3x2: case CG_BOOL3x3: case CG_BOOL3x4:
case CG_BOOL4x1: case CG_BOOL4x2: case CG_BOOL4x3: case CG_BOOL4x4:
case CG_FIXED1x1: case CG_FIXED1x2: case CG_FIXED1x3: case CG_FIXED1x4:
case CG_FIXED2x1: case CG_FIXED2x2: case CG_FIXED2x3: case CG_FIXED2x4:
case CG_FIXED3x1: case CG_FIXED3x2: case CG_FIXED3x3: case CG_FIXED3x4:
case CG_FIXED4x1: case CG_FIXED4x2: case CG_FIXED4x3: case CG_FIXED4x4:
rtParameter->setterrIndex = setMatrixTypeIndex[( containerEntry->flags&CGP_ARRAY ) ? 1 : 0][( containerEntry->flags&CGPF_SHARED ) ? 1 : 0][profileIndex][rglGetTypeRowCount(( CGtype )parameterResource->type ) - 1][rglGetTypeColCount(( CGtype )parameterResource->type ) - 1][ROW_MAJOR];
rtParameter->settercIndex = setMatrixTypeIndex[( containerEntry->flags&CGP_ARRAY ) ? 1 : 0][( containerEntry->flags&CGPF_SHARED ) ? 1 : 0][profileIndex][rglGetTypeRowCount(( CGtype )parameterResource->type ) - 1][rglGetTypeColCount(( CGtype )parameterResource->type ) - 1][COL_MAJOR];
break;
// addition to be compatible with cgc 2.0
case CG_STRING:
break;
default:
break;
}
}
}
else if (( parameterEntry->flags & CGPV_MASK ) == CGPV_VARYING )
{
if (( parameterEntry->flags & CGPD_MASK ) == CGPD_IN && profileIndex == VERTEX_PROFILE_INDEX )
{
rtParameter->setterIndex = setAttribConstantIndex;
}
}
}
else
{
if (( parameterEntry->flags & CGPV_MASK ) == CGPV_UNIFORM )
{
if ( parameterEntry->flags & CGP_ARRAY )
continue;
const CgParameterResource *parameterResource = rglGetParameterResource( program, parameterEntry );
// we silently ignore valid sets on unused parameters.
switch ( parameterResource->type )
{
case CG_FLOAT:
case CG_FLOAT1: case CG_FLOAT2: case CG_FLOAT3: case CG_FLOAT4:
//rtParameter->setter = _cgIgnoreSetParam;
rtParameter->setterIndex = _cgIgnoreSetParamIndex;
break;
case CG_FLOAT1x1: case CG_FLOAT1x2: case CG_FLOAT1x3: case CG_FLOAT1x4:
case CG_FLOAT2x1: case CG_FLOAT2x2: case CG_FLOAT2x3: case CG_FLOAT2x4:
case CG_FLOAT3x1: case CG_FLOAT3x2: case CG_FLOAT3x3: case CG_FLOAT3x4:
case CG_FLOAT4x1: case CG_FLOAT4x2: case CG_FLOAT4x3: case CG_FLOAT4x4:
//rtParameter->setterr = _cgIgnoreSetParam;
//rtParameter->setterc = _cgIgnoreSetParam;
rtParameter->setterrIndex = _cgIgnoreSetParamIndex;
rtParameter->settercIndex = _cgIgnoreSetParamIndex;
break;
case CG_SAMPLER1D: case CG_SAMPLER2D: case CG_SAMPLER3D: case CG_SAMPLERRECT: case CG_SAMPLERCUBE:
rtParameter->samplerSetter = _cgIgnoreSetParamIndex;
break;
case CGP_SCF_BOOL:
break;
case CG_HALF:
case CG_HALF1: case CG_HALF2: case CG_HALF3: case CG_HALF4:
case CG_INT:
case CG_INT1: case CG_INT2: case CG_INT3: case CG_INT4:
case CG_BOOL:
case CG_BOOL1: case CG_BOOL2: case CG_BOOL3: case CG_BOOL4:
case CG_FIXED:
case CG_FIXED1: case CG_FIXED2: case CG_FIXED3: case CG_FIXED4:
//rtParameter->setter = _cgIgnoreSetParam;
rtParameter->setterIndex = _cgIgnoreSetParamIndex;
break;
case CG_HALF1x1: case CG_HALF1x2: case CG_HALF1x3: case CG_HALF1x4:
case CG_HALF2x1: case CG_HALF2x2: case CG_HALF2x3: case CG_HALF2x4:
case CG_HALF3x1: case CG_HALF3x2: case CG_HALF3x3: case CG_HALF3x4:
case CG_HALF4x1: case CG_HALF4x2: case CG_HALF4x3: case CG_HALF4x4:
case CG_INT1x1: case CG_INT1x2: case CG_INT1x3: case CG_INT1x4:
case CG_INT2x1: case CG_INT2x2: case CG_INT2x3: case CG_INT2x4:
case CG_INT3x1: case CG_INT3x2: case CG_INT3x3: case CG_INT3x4:
case CG_INT4x1: case CG_INT4x2: case CG_INT4x3: case CG_INT4x4:
case CG_BOOL1x1: case CG_BOOL1x2: case CG_BOOL1x3: case CG_BOOL1x4:
case CG_BOOL2x1: case CG_BOOL2x2: case CG_BOOL2x3: case CG_BOOL2x4:
case CG_BOOL3x1: case CG_BOOL3x2: case CG_BOOL3x3: case CG_BOOL3x4:
case CG_BOOL4x1: case CG_BOOL4x2: case CG_BOOL4x3: case CG_BOOL4x4:
case CG_FIXED1x1: case CG_FIXED1x2: case CG_FIXED1x3: case CG_FIXED1x4:
case CG_FIXED2x1: case CG_FIXED2x2: case CG_FIXED2x3: case CG_FIXED2x4:
case CG_FIXED3x1: case CG_FIXED3x2: case CG_FIXED3x3: case CG_FIXED3x4:
case CG_FIXED4x1: case CG_FIXED4x2: case CG_FIXED4x3: case CG_FIXED4x4:
//rtParameter->setterr = _cgIgnoreSetParam;
//rtParameter->setterc = _cgIgnoreSetParam;
rtParameter->setterrIndex = _cgIgnoreSetParamIndex;
rtParameter->settercIndex = _cgIgnoreSetParamIndex;
break;
// addition to be compatible with cgc 2.0
case CG_STRING:
break;
default:
break;
}
}
else if (( parameterEntry->flags & CGPV_MASK ) == CGPV_VARYING )
{
if (( parameterEntry->flags & CGPD_MASK ) == CGPD_IN && profileIndex == VERTEX_PROFILE_INDEX )
{
rtParameter->setterIndex = setAttribConstantIndex;
}
}
}
arrayCount = 1;
containerEntry = NULL;
}
//add padding
if ( bufferSize > 0 )
{
int nopCount = ( program->constantPushBuffer + bufferSize ) - ( unsigned int * )rglGcmCurrent;
GCM_FUNC_BUFFERED( cellGcmSetNopCommand, rglGcmCurrent, nopCount ); // GCM_PORT_TESTED [KHOFF]
}
}
//this function sets the embedded constants to their default values in the ucode of a fragment shader
//it's called at setup time right after loading the program. this function could be removed if the
//default values were already baked into the shader code
void rglSetDefaultValuesVP( _CGprogram *program )
{
int count = program->defaultValuesIndexCount;
for ( int i = 0;i < count;i++ )
{
int index = ( int )program->defaultValuesIndices[i].entryIndex;
CgRuntimeParameter *rtParameter = program->runtimeParameters + index;
int arrayCount = 1;
const CgParameterEntry *parameterEntry = rtParameter->parameterEntry;
bool isArray = false;
if ( parameterEntry->flags & CGP_ARRAY )
{
isArray = true;
const CgParameterArray *parameterArray = rglGetParameterArray( program, parameterEntry );
arrayCount = rglGetSizeofSubArray( parameterArray->dimensions, parameterArray->dimensionCount );
parameterEntry++;
rtParameter++;
}
if ( rtParameter->pushBufferPointer ) //unreferenced might have default values
{
const CgParameterResource *parameterResource = rglGetParameterResource( program, parameterEntry );
const float *itemDefaultValues = program->defaultValues + program->defaultValuesIndices[i].defaultValueIndex;
int registerStride = isMatrix(( CGtype )parameterResource->type ) ? rglGetTypeRowCount(( CGtype )parameterResource->type ) : 1;
if ( parameterEntry->flags & CGP_CONTIGUOUS )
memcpy( rtParameter->pushBufferPointer, itemDefaultValues, arrayCount * registerStride *4*sizeof( float ) );
else
{
unsigned int *pushBufferPointer = (( unsigned int * )rtParameter->pushBufferPointer );
for ( int j = 0;j < arrayCount;j++ )
{
unsigned int *pushBufferAddress = isArray ? ( *( unsigned int** )pushBufferPointer ) : pushBufferPointer;
memcpy( pushBufferAddress, itemDefaultValues, registerStride*4*sizeof( float ) );
pushBufferPointer += isArray ? 1 : 3 + registerStride * 4;
itemDefaultValues += 4 * registerStride;
}
}
}
}
}
void rglSetDefaultValuesFP( _CGprogram *program )
{
int count = program->defaultValuesIndexCount;
for ( int i = 0;i < count;i++ )
{
const void * __restrict pItemDefaultValues = program->defaultValues + program->defaultValuesIndices[i].defaultValueIndex;
const unsigned int * itemDefaultValues = ( const unsigned int * )pItemDefaultValues;
int index = ( int )program->defaultValuesIndices[i].entryIndex;
CgRuntimeParameter *rtParameter = program->runtimeParameters + index;
float *hostMemoryCopy = ( float * )rtParameter->pushBufferPointer;
if ( hostMemoryCopy ) //certain parameters are not referenced but still have default values
{
const CgParameterEntry *parameterEntry = rtParameter->parameterEntry;
int arrayCount = 1;
if ( parameterEntry->flags & CGP_ARRAY )
{
const CgParameterArray *parameterArray = rglGetParameterArray( program, parameterEntry );
arrayCount = rglGetSizeofSubArray( parameterArray->dimensions, parameterArray->dimensionCount );
i++;
parameterEntry++;
}
const CgParameterResource *parameterResource = rglGetParameterResource( program, parameterEntry );
unsigned short *resource = program->resources + parameterResource->resource + 1; //+1 to skip the register
int registerStride = isMatrix(( CGtype )parameterResource->type ) ? rglGetTypeRowCount(( CGtype )parameterResource->type ) : 1;
int registerCount = arrayCount * registerStride;
int j;
for ( j = 0;j < registerCount;j++ )
{
unsigned short embeddedConstCount = *( resource++ );
int k;
for ( k = 0;k < embeddedConstCount;k++ )
{
unsigned short ucodePatchOffset = *( resource )++;
unsigned int *dst = ( unsigned int* )(( char* )program->ucode + ucodePatchOffset );
dst[0] = SWAP_IF_BIG_ENDIAN( itemDefaultValues[0] );
dst[1] = SWAP_IF_BIG_ENDIAN( itemDefaultValues[1] );
dst[2] = SWAP_IF_BIG_ENDIAN( itemDefaultValues[2] );
dst[3] = SWAP_IF_BIG_ENDIAN( itemDefaultValues[3] );
}
memcpy(( void* )hostMemoryCopy, ( void* )itemDefaultValues, sizeof( float )*4 );
hostMemoryCopy += 4;
itemDefaultValues += 4;
resource++; //skip the register of the next item
}
}
}
}
/*============================================================
PLATFORM BUFFER
============================================================ */
static void rglDeallocateBuffer( rglBufferObject* bufferObject )
{
rglGcmBufferObject *rglBuffer = ( rglGcmBufferObject * )bufferObject->platformBufferObject;
switch ( rglBuffer->pool )
{
case RGLGCM_SURFACE_POOL_LINEAR:
gmmFree( rglBuffer->bufferId );
break;
case RGLGCM_SURFACE_POOL_NONE:
break;
default:
break;
}
rglBuffer->pool = RGLGCM_SURFACE_POOL_NONE;
rglBuffer->bufferId = GMM_ERROR;
}
static void rglpsAllocateBuffer(rglBufferObject* bufferObject)
{
rglGcmBufferObject *rglBuffer = ( rglGcmBufferObject * )bufferObject->platformBufferObject;
// free current buffer (if any)
rglDeallocateBuffer( bufferObject );
// allocate in GPU memory
rglBuffer->pool = RGLGCM_SURFACE_POOL_LINEAR;
rglBuffer->bufferId = gmmAlloc((CellGcmContextData*)&rglGcmState_i.fifo,
CELL_GCM_LOCATION_LOCAL, 0, rglBuffer->bufferSize);
rglBuffer->pitch = 0;
if ( rglBuffer->bufferId == GMM_ERROR )
rglBuffer->pool = RGLGCM_SURFACE_POOL_NONE;
GLuint referenceCount = bufferObject->textureReferences.getCount();
if ( referenceCount > 0 )
{
for ( GLuint i = 0;i < referenceCount;++i )
{
rglTexture *texture = bufferObject->textureReferences[i];
rglGcmTexture *gcmTexture = ( rglGcmTexture * )texture->platformTexture;
gcmTexture->gpuAddressId = rglBuffer->bufferId;
gcmTexture->gpuAddressIdOffset = texture->offset;
texture->revalidate |= RGL_TEXTURE_REVALIDATE_PARAMETERS;
rglTextureTouchFBOs( texture );
}
_CurrentContext->needValidate |= RGL_VALIDATE_TEXTURES_USED;
}
}
int rglpBufferObjectSize(void) { return sizeof(rglGcmBufferObject); }
GLboolean rglpCreateBufferObject( rglBufferObject* bufferObject )
{
rglGcmBufferObject *rglBuffer = ( rglGcmBufferObject * )bufferObject->platformBufferObject;
rglBuffer->pool = RGLGCM_SURFACE_POOL_NONE;
rglBuffer->bufferId = GMM_ERROR;
rglBuffer->mapCount = 0;
rglBuffer->mapAccess = GL_NONE;
// allocate initial buffer
rglBuffer->bufferSize = rglPad( bufferObject->size, RGL_BUFFER_OBJECT_BLOCK_SIZE );
rglpsAllocateBuffer( bufferObject );
return rglBuffer->bufferId != GMM_ERROR;
}
void rglPlatformDestroyBufferObject( rglBufferObject* bufferObject )
{
rglDeallocateBuffer( bufferObject );
}
void rglPlatformBufferObjectSetData( rglBufferObject* bufferObject, GLintptr offset, GLsizeiptr size, const GLvoid *data, GLboolean tryImmediateCopy )
{
rglGcmBufferObject *rglBuffer = ( rglGcmBufferObject * )bufferObject->platformBufferObject;
if ( size == bufferObject->size && tryImmediateCopy )
{
memcpy( gmmIdToAddress( rglBuffer->bufferId ) + offset, data, size );
}
else
if ( size >= bufferObject->size )
{
// reallocate the buffer
// To avoid waiting for the GPU to finish with the buffer, just
// allocate a whole new one.
rglBuffer->bufferSize = rglPad( size, RGL_BUFFER_OBJECT_BLOCK_SIZE );
rglpsAllocateBuffer( bufferObject );
// copy directly to newly allocated memory
// TODO: For GPU destination, should we copy to system memory and
// pull from GPU?
switch ( rglBuffer->pool )
{
case RGLGCM_SURFACE_POOL_NONE:
rglSetError( GL_OUT_OF_MEMORY );
return;
default:
memcpy( gmmIdToAddress( rglBuffer->bufferId ), data, size );
break;
}
}
else
{
if ( tryImmediateCopy )
{
memcpy( gmmIdToAddress( rglBuffer->bufferId ) + offset, data, size );
}
else
{
// partial buffer write
// STREAM and DYNAMIC buffers get transferred via a bounce buffer.
rglGcmSend( rglBuffer->bufferId, offset, rglBuffer->pitch, ( const char * )data, size );
}
}
// be conservative here. Whenever we write to any Buffer Object, invalidate the vertex cache
rglGetGcmDriver()->invalidateVertexCache = GL_TRUE;
}
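// To summarize the three paths above: a full-size immediate write copies
// straight into the current allocation; a write of equal or larger size
// reallocates the buffer and copies; a smaller write either copies in place
// (immediate) or is staged through the bounce-buffer path in rglGcmSend.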
GLvoid rglPlatformBufferObjectCopyData(
rglBufferObject* bufferObjectDst,
rglBufferObject* bufferObjectSrc )
{
rglGcmBufferObject* dst = ( rglGcmBufferObject* )bufferObjectDst->platformBufferObject;
rglGcmBufferObject* src = ( rglGcmBufferObject* )bufferObjectSrc->platformBufferObject;
// copy data
// There is currently no requirement to copy from one pool to another.
rglGcmMemcpy( dst->bufferId, 0, dst->pitch, src->bufferId, 0, src->bufferSize );
// be conservative here. Whenever we write to any Buffer Object, invalidate the vertex cache
rglGetGcmDriver()->invalidateVertexCache = GL_TRUE;
}
char *rglPlatformBufferObjectMap( rglBufferObject* bufferObject, GLenum access )
{
rglGcmBufferObject *rglBuffer = ( rglGcmBufferObject * )bufferObject->platformBufferObject;
if ( rglBuffer->mapCount++ == 0 )
{
if ( access == GL_WRITE_ONLY )
{
// replace entire buffer
// To avoid waiting for the GPU to finish using the buffer,
// just allocate a new one.
rglpsAllocateBuffer( bufferObject );
if ( rglBuffer->pool == RGLGCM_SURFACE_POOL_NONE )
{
rglSetError( GL_OUT_OF_MEMORY );
return NULL;
}
}
else
rglpFifoGlFinish(); // must wait in order to read
rglBuffer->mapAccess = access;
// count writable mapped buffers
// If any buffers are left mapped when a draw is invoked, we must
// flush the vertex cache in case VBO data has been modified.
if ( rglBuffer->mapAccess != GL_READ_ONLY )
{
rglGcmDriver *driver = rglGetGcmDriver();
++driver->flushBufferCount;
}
// only need to pin the first time we map
gmmPinId( rglBuffer->bufferId );
}
return gmmIdToAddress( rglBuffer->bufferId );
}
GLboolean rglPlatformBufferObjectUnmap( rglBufferObject* bufferObject )
{
// can't unmap if not mapped
rglGcmBufferObject *rglBuffer = ( rglGcmBufferObject * )bufferObject->platformBufferObject;
if ( --rglBuffer->mapCount == 0 )
{
// count writable mapped buffers
// If any buffers are left mapped when a draw is invoked, we must
// flush the vertex cache in case VBO data has been modified.
if ( rglBuffer->mapAccess != GL_READ_ONLY )
{
rglGcmDriver *driver = rglGetGcmDriver();
--driver->flushBufferCount;
// make sure we flush for the next draw
driver->invalidateVertexCache = GL_TRUE;
}
rglBuffer->mapAccess = GL_NONE;
gmmUnpinId( rglBuffer->bufferId );
}
return GL_TRUE;
}
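// Illustrative usage sketch of the map/unmap pair above (not part of the
// library; "vbo" and "vertexData" are hypothetical):
//
//    char *ptr = rglPlatformBufferObjectMap( vbo, GL_WRITE_ONLY );
//    if ( ptr )
//    {
//       memcpy( ptr, vertexData, vbo->size ); // fill the fresh allocation
//       rglPlatformBufferObjectUnmap( vbo );  // unpin, flag a cache flush
//    }
//
// A GL_WRITE_ONLY map orphans the old storage, so no GPU synchronization is
// needed; any other access mode forces a full rglpFifoGlFinish() first.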
/*============================================================
PLATFORM FRAME BUFFER OPERATIONS
============================================================ */
void rglFBClear( GLbitfield mask )
{
RGLcontext* LContext = _CurrentContext;
rglGcmDriver *driver = rglGetGcmDriver();
if ( !driver->rtValid ) return;
GLbitfield newmask = 0;
if (( mask & GL_COLOR_BUFFER_BIT ) && driver->rt.colorBufferCount ) newmask |= RGLGCM_COLOR_BUFFER_BIT;
if ( !newmask ) return;
GLbitfield clearMask = newmask;
if ( driver->rt.colorFormat != RGLGCM_ARGB8 ) clearMask &= ~RGLGCM_COLOR_BUFFER_BIT;
// always use quad clear for colors with MRT
// There is one global clear mask for all render targets. This doesn't
// work nicely for all render target combinations, e.g. only the first
// and last targets enabled. Quad clear works because color mask is
// per target.
//
// TODO: Clear could be used if the enabled render targets are
// contiguous from 0, i.e. {0,1}, {0,1,2}, {0,1,2,3}. If this is done,
// parallel changes need to be made in rglValidateWriteMask because we
// bypass calling rglGcmFifoGlColorMask there and the mask used by nv_glClear
// is not updated.
if ( driver->rt.colorBufferCount > 1 )
clearMask &= ~RGLGCM_COLOR_BUFFER_BIT;
if ( clearMask )
{
rglGcmFifoGlClearColor( 0, 0, 0, 0);
rglGcmFifoGlClear( clearMask );
newmask &= ~clearMask;
}
if ( newmask )
{
// draw a quad to erase everything.
// disable/set up a lot of states
//
static float rglClearVertexBuffer[4*3] __attribute__(( aligned( RGL_ALIGN_FAST_TRANSFER ) ) ) =
{
-1.f, -1.f, 0.f,
-1.f, 1.f, 0.f,
1.f, -1.f, 0.f,
1.f, 1.f, 0.f,
};
GLuint bufferId = gmmAlloc((CellGcmContextData*)&rglGcmState_i.fifo,
CELL_GCM_LOCATION_LOCAL, 0, sizeof(rglClearVertexBuffer));
memcpy( gmmIdToAddress(bufferId), rglClearVertexBuffer, sizeof( rglClearVertexBuffer ) );
rglGcmFifoGlVertexAttribPointer( 0, 3, RGLGCM_FLOAT, RGLGCM_FALSE, 3*sizeof( GLfloat ), 1, 0, gmmIdToOffset(bufferId) );
RGLBIT_TRUE( LContext->attribs->DirtyMask, 0 );
for ( int i = 1;i < RGL_MAX_VERTEX_ATTRIBS;++i )
{
rglGcmFifoGlVertexAttribPointer( i, 0, RGLGCM_FLOAT, 0, 0, 0, 0, 0 );
RGLBIT_TRUE( LContext->attribs->DirtyMask, i );
}
// pass four zero floats; the fifo call reads a full vec4
GLfloat clearcolor[4] = { 0.f, 0.f, 0.f, 0.f };
rglGcmFifoGlVertexAttrib4fv( RGL_ATTRIB_PRIMARY_COLOR_INDEX, clearcolor );
LContext->needValidate |= RGL_VALIDATE_WRITE_MASK | RGL_VALIDATE_FRAGMENT_PROGRAM;
gmmFree( bufferId );
}
rglGcmFifoGlFlush();
}
/*============================================================
PLATFORM FRAMEBUFFER
============================================================ */
rglFramebuffer* rglCreateFramebuffer( void )
{
rglFramebuffer* framebuffer = new rglPlatformFramebuffer();
return framebuffer;
}
void rglDestroyFramebuffer( rglFramebuffer* framebuffer )
{
delete framebuffer;
}
GLenum rglPlatformFramebufferCheckStatus( rglFramebuffer* framebuffer )
{
RGLcontext* LContext = _CurrentContext;
GLuint nBuffers = 0; // number of attached buffers
int width = 0;
int height = 0;
GLboolean sizeMismatch = GL_FALSE;
// record attached images
// We have to verify that no image is attached more than once. The
// array is sized for color attachments plus depth and stencil.
rglImage* image[RGL_MAX_COLOR_ATTACHMENTS + 2] = {0};
// test colors
GLuint colorFormat = 0;
for ( int i = 0; i < RGL_MAX_COLOR_ATTACHMENTS; ++i )
{
rglTexture* colorTexture = NULL;
GLuint colorFace = 0;
rglFramebufferGetAttachmentTexture(
LContext,
&framebuffer->color[i],
&colorTexture,
&colorFace );
// TODO: Complete texture may not be required.
if ( colorTexture != NULL )
{
if ( colorTexture->referenceBuffer && !colorTexture->isRenderTarget )
{
//RGL_REPORT_EXTRA( RGL_REPORT_FRAMEBUFFER_UNSUPPORTED, "Framebuffer color attachment texture is a reference in a format that cannot be rendered to (swizzled texture, smaller than 16x16 or with more than 32 bits per pixel)" );
return GL_FRAMEBUFFER_UNSUPPORTED_OES;
}
// all attachments must have the same dimensions
image[nBuffers] = colorTexture->image;
if (( width && width != image[nBuffers]->width ) ||
( height && height != image[nBuffers]->height ) )
sizeMismatch = GL_TRUE;
width = image[nBuffers]->width;
height = image[nBuffers]->height;
// all color attachments need the same format
if ( colorFormat && colorFormat != image[nBuffers]->internalFormat )
{
//RGL_REPORT_EXTRA( RGL_REPORT_FRAMEBUFFER_INCOMPLETE, "Framebuffer attachments have inconsistent color formats" );
return GL_FRAMEBUFFER_INCOMPLETE_FORMATS_OES;
}
colorFormat = image[nBuffers]->internalFormat;
++nBuffers;
}
}
// check for supported color format
if ( nBuffers )
{
if ( !rglIsDrawableColorFormat( colorFormat ) )
{
//RGL_REPORT_EXTRA( RGL_REPORT_FRAMEBUFFER_UNSUPPORTED, "Color attachment to framebuffer must be a supported drawable format (GL_ARGB_SCE, GL_RGB16F_ARB, GL_RGBA16F_ARB, GL_RGB32F_ARB, GL_RGBA32F_ARB, GL_LUMINANCE32F_ARB)" );
return GL_FRAMEBUFFER_UNSUPPORTED_OES;
}
switch ( colorFormat )
{
case RGLGCM_ARGB8:
case RGLGCM_RGB5_A1_SCE:
case RGLGCM_RGB565_SCE:
case RGLGCM_FLOAT_R32:
break;
default:
return GL_FRAMEBUFFER_UNSUPPORTED_OES;
}
}
// at least one attachment is required
if ( nBuffers == 0 )
return GL_FRAMEBUFFER_INCOMPLETE_MISSING_ATTACHMENT_OES;
// verify no image is attached more than once
// This is an O(n^2) check; an O(n log n) approach is possible but probably
// neither necessary nor faster for so few attachments.
for ( GLuint i = 0; i < nBuffers; ++i )
for ( GLuint j = i + 1; j < nBuffers; ++j )
if ( image[i] == image[j] )
return GL_FRAMEBUFFER_INCOMPLETE_DUPLICATE_ATTACHMENT_OES;
return GL_FRAMEBUFFER_COMPLETE_OES;
}
void rglPlatformFramebuffer::validate( RGLcontext *LContext )
{
complete = ( rglPlatformFramebufferCheckStatus( this ) == GL_FRAMEBUFFER_COMPLETE_OES );
if ( !complete ) return;
GLuint width = RGLGCM_MAX_RT_DIMENSION;
GLuint height = RGLGCM_MAX_RT_DIMENSION;
GLuint xscale = 1;
GLuint yscale = 1;
// color
rt.colorBufferCount = 0;
rt.colorFormat = RGLGCM_NONE;
GLuint defaultPitch = 0;
GLuint defaultId = GMM_ERROR;
GLuint defaultIdOffset = 0;
for ( int i = 0; i < RGLGCM_SETRENDERTARGET_MAXCOUNT; ++i )
{
// get the texture and face
rglTexture* colorTexture = NULL;
GLuint face = 0;
rglFramebufferGetAttachmentTexture( LContext, &color[i], &colorTexture, &face );
if ( colorTexture == NULL ) continue;
rglGcmTexture* nvTexture = ( rglGcmTexture * )colorTexture->platformTexture;
// make sure texture is resident in a supported layout
// Some restrictions are added if a texture is used as a
// render target:
//
// - no swizzled semifat or fat formats
// - no swizzled smaller than 16x16
// - no mipmapped cube maps in tiled memory
// - no cube maps with height not a multiple of 16 in tiled
// memory
//
// We may need to reallocate the texture if any of these
// are true.
//
// TODO: Measure time spent here and optimize if indicated.
if ( !colorTexture->isRenderTarget )
{
colorTexture->isRenderTarget = GL_TRUE;
colorTexture->revalidate |= RGL_TEXTURE_REVALIDATE_LAYOUT;
}
rglPlatformValidateTextureResources( colorTexture );
colorTexture->image->dataState = RGL_IMAGE_DATASTATE_GPU;
// set the render target
rt.colorId[i] = nvTexture->gpuAddressId;
rt.colorIdOffset[i] = nvTexture->gpuAddressIdOffset + rglGetGcmImageOffset( &nvTexture->gpuLayout, face, 0 );
rt.colorPitch[i] = nvTexture->gpuLayout.pitch ? nvTexture->gpuLayout.pitch : nvTexture->gpuLayout.pixelBits * nvTexture->gpuLayout.baseWidth / 8;
width = MIN( width, nvTexture->gpuLayout.baseWidth );
height = MIN( height, nvTexture->gpuLayout.baseHeight );
rt.colorFormat = nvTexture->gpuLayout.internalFormat;
rt.colorBufferCount = i + 1;
defaultId = rt.colorId[i];
defaultIdOffset = rt.colorIdOffset[i];
defaultPitch = rt.colorPitch[i];
}
// framebuffer dimensions are the intersection of attachments
rt.width = width / xscale;
rt.height = height / yscale;
rt.yInverted = RGLGCM_FALSE;
rt.xOffset = 0;
rt.yOffset = 0;
needValidate = GL_FALSE;
}
// set render targets
void rglValidateFramebuffer( void )
{
RGLdevice *LDevice = _CurrentDevice;
rglGcmDevice *gcmDevice = ( rglGcmDevice * )LDevice->platformDevice;
RGLcontext* LContext = _CurrentContext;
rglGcmDriver * gcmDriver = rglGetGcmDriver();
// reset buffer data
gcmDriver->rtValid = GL_FALSE;
// get buffer parameters
// This may come from a framebuffer_object or the default framebuffer.
if ( LContext->framebuffer )
{
rglPlatformFramebuffer* framebuffer = static_cast<rglPlatformFramebuffer *>( rglGetFramebuffer( LContext, LContext->framebuffer ) );
if ( framebuffer->needValidate ) framebuffer->validate( LContext );
gcmDriver->rt = framebuffer->rt;
}
else // use default framebuffer
gcmDriver->rt = gcmDevice->rt;
gcmDriver->rtValid = GL_TRUE;
// update GPU configuration
rglGcmFifoGlSetRenderTarget( &gcmDriver->rt );
LContext->needValidate &= ~RGL_VALIDATE_FRAMEBUFFER;
LContext->needValidate |= RGL_VALIDATE_VIEWPORT | RGL_VALIDATE_SCISSOR_BOX
| RGL_VALIDATE_WRITE_MASK;
}
/*============================================================
PLATFORM RASTER
============================================================ */
#define RGL_ATTRIB_BUFFER_ALIGNMENT 16
// maximum size for drawing data
#define RGLGCM_MAX_VERTEX_BUFFER_SIZE (2 << 20)
#define RGLGCM_MAX_INDEX_BUFFER_SIZE (1 << 20)
// Initialize the driver and setup the fixed function pipeline
// shader and needed connections between GL state and the shader
void *rglPlatformRasterInit()
{
rglpFifoGlFinish();
rglGcmDriver *driver = ( rglGcmDriver * )malloc( sizeof( rglGcmDriver ) );
memset( driver, 0, sizeof( rglGcmDriver ) );
driver->rt.yInverted = RGLGCM_TRUE;
driver->invalidateVertexCache = GL_FALSE;
driver->flushBufferCount = 0;
// [YLIN] Make it 16-byte aligned
driver->sharedVPConstants = ( char * )memalign( 16, 4 * sizeof( float ) * RGL_MAX_VP_SHARED_CONSTANTS );
driver->sharedFPConstantsId = gmmAlloc((CellGcmContextData*)&rglGcmState_i.fifo,
CELL_GCM_LOCATION_LOCAL, 0, 4 * sizeof(float) * RGL_MAX_FP_SHARED_CONSTANTS);
return driver;
}
// Destroy the driver, and free all its used memory
void rglPlatformRasterExit( void *drv )
{
rglGcmDriver *driver = ( rglGcmDriver * )drv;
if ( !driver )
return;
gmmFree( driver->sharedFPConstantsId );
free( driver->sharedVPConstants );
free( driver );
}
/**
* @}
*/
///////////////////////////////////////////////////////////////////////////
void rglDumpFifo( char * name );
extern bool _cellRSXFifoDisassembleToFileMask;
// Fast rendering path called by several glDraw calls:
// glDrawElements, glDrawRangeElements, glDrawArrays
// The slow rendering path also calls this function, though it must first
// perform various memory setup operations.
void rglPlatformDraw( rglDrawParams* dparams )
{
rglGcmDriver *driver = rglGetGcmDriver();
if (RGL_UNLIKELY(!driver->rtValid))
return;
// check for any writable mapped buffers
if ( driver->flushBufferCount != 0 )
driver->invalidateVertexCache = GL_TRUE;
GLboolean isMain = 0;
GLuint gpuOffset = GMM_ERROR;
uint32_t totalXfer = 0;
for ( GLuint i = 0; i < RGL_MAX_VERTEX_ATTRIBS; ++i )
totalXfer += dparams->attribXferSize[i];
gpuOffset = rglValidateAttributesSlow( dparams, &isMain );
if ( driver->invalidateVertexCache )
{
driver->invalidateVertexCache = GL_FALSE;
GCM_FUNC_NO_ARGS( cellGcmSetInvalidateVertexCache );
}
// if fpLoadProgramId is in main memory, we need to set CELL_GCM_LOCATION_MAIN
if ( gmmIdIsMain(driver->fpLoadProgramId) )
{
GCM_FUNC( cellGcmSetUpdateFragmentProgramParameterLocation,
gmmIdToOffset( driver->fpLoadProgramId ) + driver->fpLoadProgramOffset, CELL_GCM_LOCATION_MAIN );
}
else
{
GCM_FUNC( cellGcmSetUpdateFragmentProgramParameter,
gmmIdToOffset( driver->fpLoadProgramId ) + driver->fpLoadProgramOffset );
}
// glDrawArrays()
rglGcmFifoGlDrawArrays(( rglGcmEnum )dparams->mode, dparams->firstVertex, dparams->vertexCount );
}
// Set up the current fragment program on hardware
void rglValidateFragmentProgram()
{
RGLcontext* LContext = _CurrentContext;
rglGcmDriver *driver = rglGetGcmDriver();
_CGprogram *program = LContext->BoundFragmentProgram;
// params are set directly in the GPU memory, so there is nothing to be done here.
rglSetNativeCgFragmentProgram( program );
driver->fpLoadProgramId = program->loadProgramId;
driver->fpLoadProgramOffset = program->loadProgramOffset;
}
// must always be called before rglPlatformDraw() to set up rglDrawParams
GLboolean rglPlatformRequiresSlowPath( rglDrawParams* dparams, const GLenum indexType, uint32_t indexCount)
{
RGLcontext* LContext = _CurrentContext;
rglAttributeState* as = LContext->attribs;
// are any enabled attributes on the client-side?
const GLuint clientSideMask = as->EnabledMask & ~as->HasVBOMask;
if ( RGL_UNLIKELY( clientSideMask ) )
{
// determine transfer buffer requirements for client-side attributes
for ( int i = 0; i < RGL_MAX_VERTEX_ATTRIBS; ++i )
{
if ( clientSideMask & ( 1 << i ) )
{
rglAttribute* attrib = as->attrib + i;
const GLuint freq = attrib->frequency;
GLuint count = ( (dparams->firstVertex + dparams->vertexCount) + freq - 1 ) / freq;
const GLuint numBytes = attrib->clientStride * count;
dparams->attribXferOffset[i] = dparams->xferTotalSize;
dparams->attribXferSize[i] = numBytes;
const GLuint numBytesPadded = rglPad( numBytes, 128 );
dparams->xferTotalSize += numBytesPadded;
dparams->attribXferTotalSize += numBytesPadded;
}
else
{
dparams->attribXferOffset[i] = 0;
dparams->attribXferSize[i] = 0;
}
}
}
return GL_FALSE; // we are finally qualified for the fast path
}
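// Example of the transfer sizing above, with illustrative numbers: for
// firstVertex == 0, vertexCount == 100, frequency == 1 and a clientStride of
// 16 bytes, count == 100 and numBytes == 1600, which rglPad rounds up to a
// 1664-byte slice of the transfer buffer.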
// Return the current RGLGcmDriver
rglGcmDriver* rglGetRGLGcmDriver()
{
return ( rglGcmDriver * )( _CurrentDevice->rasterDriver );
}
void rglPlatformRasterFlush()
{
rglGcmFifoGlFlush();
}
void rglpFifoGlFinish( void )
{
GCM_FUNC_NO_ARGS( cellGcmSetInvalidateVertexCache );
rglGcmFifoFinish( &rglGcmState_i.fifo );
}
// validates attributes for the specified draw parameters
// returns the offset of the index buffer
GLuint rglValidateAttributesSlow( rglDrawParams *dparams, GLboolean *isMain )
{
RGLcontext* LContext = _CurrentContext;
rglGcmDriver *driver = rglGetGcmDriver();
rglAttributeState* as = LContext->attribs;
// allocate upload transfer buffer if necessary
// The higher level bounce buffer allocator is used, which means that
// the buffer will automatically be freed after all RGLGCM calls up to
// the next allocation have finished.
void* xferBuffer = NULL;
GLuint xferId = GMM_ERROR;
GLuint VBOId = GMM_ERROR;
GLuint gpuOffset;
if ( RGL_UNLIKELY( dparams->xferTotalSize ) )
{
xferId = gmmAlloc((CellGcmContextData*)&rglGcmState_i.fifo,
CELL_GCM_LOCATION_LOCAL, 0, dparams->xferTotalSize);
xferBuffer = gmmIdToAddress(xferId);
}
// which attributes are known to need updating?
// (due to being dirty or enabled client-side arrays)
rglBitfield needsUpdateMask = ( as->DirtyMask | ( as->EnabledMask & ~as->HasVBOMask ) );
// for any remaining attributes that need updating, do it now.
if ( needsUpdateMask )
{
for ( GLuint i = 0; i < RGL_MAX_VERTEX_ATTRIBS; ++i )
{
// skip this attribute if not needing update
if ( ! RGLBIT_GET( needsUpdateMask, i ) ) continue;
rglAttribute* attrib = as->attrib + i;
if ( RGLBIT_GET( as->EnabledMask, i ) )
{
const GLsizei stride = attrib->clientStride;
const GLuint freq = attrib->frequency;
if ( RGL_UNLIKELY( dparams->attribXferSize[i] ) )
{
// attribute data is client side, need to transfer
// don't transfer data that is not going to be used, from 0 to first*stride
GLuint offset = ( dparams->firstVertex / freq ) * stride;
char * b = ( char * )xferBuffer + dparams->attribXferOffset[i];
memcpy( b + offset,
( char * )attrib->clientData + offset,
dparams->attribXferSize[i] - offset );
// draw directly from bounce buffer
*isMain = gmmIdIsMain(xferId);
gpuOffset = gmmIdToOffset(xferId) + (b - ( char * )xferBuffer);
}
else
{
// attribute data in VBO, clientData is offset.
VBOId = rglGcmGetBufferObjectOrigin( attrib->arrayBuffer );
*isMain = gmmIdIsMain(VBOId);
gpuOffset = gmmIdToOffset(VBOId)
+ (( const GLubyte* )attrib->clientData - ( const GLubyte* )NULL );
}
rglGcmFifoGlVertexAttribPointer( i, attrib->clientSize,
( rglGcmEnum )attrib->clientType, attrib->normalized,
stride, freq, *isMain, gpuOffset );
}
else
{
// attribute is disabled
rglGcmFifoGlVertexAttribPointer( i, 0, RGLGCM_FLOAT, 0, 0, 0, 0, 0 );
rglGcmFifoGlVertexAttrib4fv( i, attrib->value );
}
}
driver->invalidateVertexCache = GL_TRUE;
}
as->DirtyMask = 0; // all attributes are now clean
// validate index buffer
GLuint indexOffset = 0;
if ( xferId != GMM_ERROR )
gmmFree( xferId );
return indexOffset;
}
/*============================================================
PLATFORM TEXTURE
============================================================ */
// Calculate required size in bytes for given texture layout
GLuint rglGetGcmTextureSize( rglGcmTextureLayout *layout )
{
GLuint bytesNeeded = 0;
GLuint faceAlign = layout->pitch ? 1 : 128;
GLuint minWidth = 1;
GLuint minHeight = 1;
for ( GLuint face = 0;face < layout->faces;++face )
{
GLuint width = layout->baseWidth;
GLuint height = layout->baseHeight;
GLuint depth = layout->baseDepth;
for ( GLuint i = 0;i < layout->levels; ++i )
{
width = MAX( minWidth, width );
height = MAX( minHeight, height );
depth = MAX( 1U, depth );
if ( !layout->pitch )
bytesNeeded += layout->pixelBits * width * height * depth / 8;
else
bytesNeeded += height * depth * layout->pitch;
width >>= 1;
height >>= 1;
depth >>= 1;
}
bytesNeeded = rglPad( bytesNeeded, faceAlign );
}
return bytesNeeded;
}
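// Worked example (illustrative): a swizzled (pitch == 0) 64x64 ARGB8 texture
// (pixelBits == 32) with 7 levels and 1 face accumulates
// 16384 + 4096 + 1024 + 256 + 64 + 16 + 4 = 21844 bytes, which the 128-byte
// face alignment pads to 21888.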
// Calculate byte offset into a texture layout for the given face and level
GLuint rglGetGcmImageOffset( rglGcmTextureLayout *layout, GLuint face, GLuint level )
{
GLuint bytes = 0;
GLuint faceAlign = layout->pitch ? 1 : 128;
GLuint minWidth = 1;
GLuint minHeight = 1;
for ( GLuint f = 0;f < layout->faces;++f )
{
GLuint width = layout->baseWidth;
GLuint height = layout->baseHeight;
GLuint depth = layout->baseDepth;
for ( GLuint i = 0;i < layout->levels; ++i )
{
if (( level == i ) && ( face == f ) ) return bytes;
width = MAX( minWidth, width );
height = MAX( minHeight, height );
depth = MAX( 1U, depth );
if ( !layout->pitch )
bytes += layout->pixelBits * width * height * depth / 8;
else
bytes += height * depth * layout->pitch;
width >>= 1;
height >>= 1;
depth >>= 1;
}
bytes = rglPad( bytes, faceAlign );
}
return 0;
}
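// Continuing the example above, rglGetGcmImageOffset( layout, 0, 1 ) returns
// 16384, the level-0 bytes that precede mip level 1 of face 0; a face/level
// pair that is never reached falls through and returns 0.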
// Get size of the platform texture object
int rglPlatformTextureSize()
{
return sizeof( rglGcmTexture );
}
// Calculate pitch for a texture
static GLuint _getTexturePitch( const rglTexture * texture )
{
return rglPad( rglGetStorageSize( texture->image->format, texture->image->type, texture->image->width, 1, 1 ), 64 ); // TransferVid2Vid needs 64-byte pitch alignment
}
// Return maximum number of texture image units
int rglPlatformTextureMaxUnits()
{
return RGLGCM_MAX_TEXIMAGE_COUNT;
}
// Create a gcm texture by initializing memory to 0
void rglPlatformCreateTexture( rglTexture* texture )
{
rglGcmTexture *gcmTexture = ( rglGcmTexture * )texture->platformTexture;
memset( gcmTexture, 0, sizeof( rglGcmTexture ) );
gcmTexture->gpuAddressId = GMM_ERROR;
}
void rglPlatformFreeGcmTexture( rglTexture* texture );
// Destroy a texture by freeing a gcm texture and an associated PBO
void rglPlatformDestroyTexture( rglTexture* texture )
{
if ( !texture->referenceBuffer )
rglPlatformFreeGcmTexture( texture );
rglTextureTouchFBOs( texture );
}
// Get size of texture in GPU layout
inline static GLuint rglPlatformTextureGetGPUSize( const rglTexture* texture )
{
rglGcmTexture *gcmTexture = ( rglGcmTexture * )texture->platformTexture;
return rglGetGcmTextureSize( &gcmTexture->gpuLayout );
}
// Drop a texture from the GPU memory by detaching it from a PBO
void rglPlatformDropTexture( rglTexture *texture )
{
rglGcmTexture *gcmTexture = ( rglGcmTexture * )texture->platformTexture;
if (gcmTexture->pool != RGLGCM_SURFACE_POOL_NONE)
{
rglPlatformFreeGcmTexture( texture );
}
gcmTexture->pool = RGLGCM_SURFACE_POOL_NONE;
gcmTexture->gpuAddressId = GMM_ERROR;
gcmTexture->gpuAddressIdOffset = 0;
gcmTexture->gpuSize = 0;
texture->revalidate |= RGL_TEXTURE_REVALIDATE_IMAGES;
rglTextureTouchFBOs( texture );
}
// Drop unbound textures from the GPU memory
// This is kind of slow, but we hit a slow path anyway.
// If the pool argument is not RGLGCM_SURFACE_POOL_NONE, then only textures
// in the specified pool will be dropped.
void rglPlatformDropUnboundTextures( GLenum pool )
{
RGLcontext* LContext = _CurrentContext;
GLuint i, j;
for ( i = 0;i < LContext->textureNameSpace.capacity;++i )
{
GLboolean bound = GL_FALSE;
rglTexture *texture = ( rglTexture * )LContext->textureNameSpace.data[i];
if ( !texture || ( texture->referenceBuffer != 0 ) ) continue;
// check if bound
for ( j = 0;j < RGL_MAX_TEXTURE_IMAGE_UNITS;++j )
{
rglTextureImageUnit *tu = LContext->TextureImageUnits + j;
if ( tu->bound2D == i)
{
bound = GL_TRUE;
break;
}
}
if ( bound )
continue;
rglGcmTexture *gcmTexture = ( rglGcmTexture * )texture->platformTexture;
// check pool
if ( pool != RGLGCM_SURFACE_POOL_NONE &&
pool != gcmTexture->pool )
continue;
rglPlatformDropTexture( texture );
}
}
// Drop filtering mode for FP32 textures
static inline GLenum unFilter( GLenum filter )
{
GLenum newFilter;
switch ( filter )
{
case GL_NEAREST:
case GL_LINEAR:
newFilter = GL_NEAREST;
break;
case GL_NEAREST_MIPMAP_NEAREST:
case GL_NEAREST_MIPMAP_LINEAR:
case GL_LINEAR_MIPMAP_NEAREST:
case GL_LINEAR_MIPMAP_LINEAR:
newFilter = GL_NEAREST_MIPMAP_NEAREST;
break;
default:
newFilter = GL_NEAREST;
}
return newFilter;
}
// Choose a texture layout and store it to newLayout, based on texture's filtering mode, swizzling, and size
void rglPlatformChooseGPUFormatAndLayout(
const rglTexture* texture,
GLboolean forceLinear,
GLuint pitch,
rglGcmTextureLayout* newLayout )
{
rglImage *image = texture->image + texture->baseLevel;
GLuint levels = rglLog2( MAX( MAX( image->width, image->height ), image->depth ) ) + 1;
levels = MIN( levels, texture->maxLevel + 1 );
// if the filter mode doesn't use mipmaps, only the base level is needed.
if (( texture->minFilter == GL_LINEAR ) || ( texture->minFilter == GL_NEAREST ) )
{
levels = 1;
}
newLayout->levels = levels;
newLayout->faces = texture->faceCount;
newLayout->baseWidth = image->width;
newLayout->baseHeight = image->height;
newLayout->baseDepth = image->depth;
newLayout->internalFormat = ( rglGcmEnum )image->internalFormat;
newLayout->pixelBits = rglPlatformGetBitsPerPixel( newLayout->internalFormat );
newLayout->pitch = pitch ? pitch : _getTexturePitch( texture );
}
// texture strategy actions
// A texture strategy is a sequence of actions represented by these tokens.
// RGL_TEXTURE_STRATEGY_END must be the last token in any strategy.
enum rglTextureStrategy {
RGL_TEXTURE_STRATEGY_END, // allocation failed, give up
RGL_TEXTURE_STRATEGY_FORCE_LINEAR,
RGL_TEXTURE_STRATEGY_TILED_ALLOC,
RGL_TEXTURE_STRATEGY_TILED_CLEAR,
RGL_TEXTURE_STRATEGY_UNTILED_ALLOC,
RGL_TEXTURE_STRATEGY_UNTILED_CLEAR,
RGL_TEXTURE_STRATEGY_SYSTEM_ALLOC,
RGL_TEXTURE_STRATEGY_SYSTEM_CLEAR, // XXX probably not useful
};
static enum rglTextureStrategy tiledGPUStrategy[] =
{
RGL_TEXTURE_STRATEGY_TILED_ALLOC, // try tiled allocation
RGL_TEXTURE_STRATEGY_FORCE_LINEAR,
RGL_TEXTURE_STRATEGY_UNTILED_ALLOC, // if failure, try linear allocation
RGL_TEXTURE_STRATEGY_UNTILED_CLEAR, // if failure, drop linear textures
RGL_TEXTURE_STRATEGY_UNTILED_ALLOC, // try linear again
RGL_TEXTURE_STRATEGY_END, // give up
};
static enum rglTextureStrategy linearGPUStrategy[] =
{
RGL_TEXTURE_STRATEGY_FORCE_LINEAR,
RGL_TEXTURE_STRATEGY_UNTILED_ALLOC,
RGL_TEXTURE_STRATEGY_UNTILED_CLEAR,
RGL_TEXTURE_STRATEGY_UNTILED_ALLOC,
RGL_TEXTURE_STRATEGY_END,
};
static enum rglTextureStrategy swizzledGPUStrategy[] =
{
RGL_TEXTURE_STRATEGY_UNTILED_ALLOC,
RGL_TEXTURE_STRATEGY_UNTILED_CLEAR,
RGL_TEXTURE_STRATEGY_UNTILED_ALLOC,
RGL_TEXTURE_STRATEGY_END,
};
static enum rglTextureStrategy linearSystemStrategy[] =
{
RGL_TEXTURE_STRATEGY_FORCE_LINEAR,
RGL_TEXTURE_STRATEGY_SYSTEM_ALLOC,
RGL_TEXTURE_STRATEGY_END,
};
static enum rglTextureStrategy swizzledSystemStrategy[] =
{
RGL_TEXTURE_STRATEGY_SYSTEM_ALLOC,
RGL_TEXTURE_STRATEGY_END,
};
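// For example, GL_TEXTURE_TILED_GPU_SCE walks tiledGPUStrategy in the loop
// below. Note that steps with no matching case in the switch (the TILED_*
// and SYSTEM_* steps in this build) hit the default and are skipped, so the
// effective sequence is: force linear, untiled alloc, evict unbound linear
// textures, retry the untiled alloc, then give up with GL_OUT_OF_MEMORY.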
// Reallocate texture storage based on usage, walking the matching pool strategy
void rglPlatformReallocateGcmTexture( rglTexture* texture )
{
rglGcmTexture *gcmTexture = ( rglGcmTexture * )texture->platformTexture;
// select the allocation strategy
enum rglTextureStrategy *step = NULL;
switch ( texture->usage )
{
case GL_TEXTURE_TILED_GPU_SCE:
step = tiledGPUStrategy;
break;
case GL_TEXTURE_LINEAR_GPU_SCE:
step = linearGPUStrategy;
break;
case GL_TEXTURE_SWIZZLED_GPU_SCE:
step = swizzledGPUStrategy;
break;
case GL_TEXTURE_LINEAR_SYSTEM_SCE:
step = linearSystemStrategy;
break;
case GL_TEXTURE_SWIZZLED_SYSTEM_SCE:
step = swizzledSystemStrategy;
break;
default:
step = swizzledGPUStrategy;
break;
}
GLuint size = 0;
GLuint id = GMM_ERROR;
// allow swizzled format unless explicitly disallowed
// PBO textures cannot be swizzled.
GLboolean forceLinear = GL_FALSE;
const rglGcmTextureLayout currentLayout = gcmTexture->gpuLayout;
const GLuint currentSize = gcmTexture->gpuSize;
// process strategy
GLboolean done = GL_FALSE;
while ( !done )
{
rglGcmTextureLayout newLayout;
switch ( *step++ )
{
case RGL_TEXTURE_STRATEGY_FORCE_LINEAR:
forceLinear = GL_TRUE;
break;
case RGL_TEXTURE_STRATEGY_UNTILED_ALLOC:
// get layout and size compatible with this pool
rglPlatformChooseGPUFormatAndLayout( texture, forceLinear, 0, &newLayout );
size = rglGetGcmTextureSize( &newLayout );
// determine if current allocation already works
// If the current allocation has the right size and pool, we
// don't have to do anything. If not, we only drop from the
// target pool because we may reuse the allocation from a
// different pool in a later step.
if ( gcmTexture->pool == RGLGCM_SURFACE_POOL_LINEAR )
{
if ( currentSize >= size && newLayout.pitch == currentLayout.pitch )
{
gcmTexture->gpuLayout = newLayout;
done = GL_TRUE;
}
else
rglPlatformDropTexture( texture );
}
if ( !done )
{
// allocate in the specified pool
id = gmmAlloc((CellGcmContextData*)&rglGcmState_i.fifo,
CELL_GCM_LOCATION_LOCAL,
0,
size);
if ( id != GMM_ERROR )
{
// drop old allocation
if ( gcmTexture->pool != RGLGCM_SURFACE_POOL_NONE )
rglPlatformDropTexture( texture );
// set new
gcmTexture->pool = RGLGCM_SURFACE_POOL_LINEAR;
gcmTexture->gpuAddressId = id;
gcmTexture->gpuAddressIdOffset = 0;
gcmTexture->gpuSize = size;
gcmTexture->gpuLayout = newLayout;
done = GL_TRUE;
}
}
break;
case RGL_TEXTURE_STRATEGY_UNTILED_CLEAR:
rglPlatformDropUnboundTextures( RGLGCM_SURFACE_POOL_LINEAR );
break;
case RGL_TEXTURE_STRATEGY_END:
rglSetError( GL_OUT_OF_MEMORY );
done = GL_TRUE;
break;
default:
break;
}
} // while loop for allocation strategy steps
rglTextureTouchFBOs( texture );
}
// Free the pool memory backing a GCM texture
void rglPlatformFreeGcmTexture( rglTexture* texture )
{
rglGcmTexture *gcmTexture = ( rglGcmTexture * )texture->platformTexture;
switch ( gcmTexture->pool )
{
case RGLGCM_SURFACE_POOL_LINEAR:
gmmFree( gcmTexture->gpuAddressId );
break;
case RGLGCM_SURFACE_POOL_SYSTEM:
gmmFree( gcmTexture->gpuAddressId );
break;
case RGLGCM_SURFACE_POOL_TILED_COLOR:
rglGcmFreeTiledSurface( gcmTexture->gpuAddressId );
break;
case RGLGCM_SURFACE_POOL_NONE:
break;
}
gcmTexture->gpuAddressId = GMM_ERROR;
gcmTexture->gpuAddressIdOffset = 0;
gcmTexture->gpuSize = 0;
}
// Upload texture from host memory to GPU memory
void rglPlatformUploadTexture( rglTexture* texture )
{
rglGcmTexture *gcmTexture = ( rglGcmTexture * )texture->platformTexture;
rglGcmTextureLayout *layout = &gcmTexture->gpuLayout;
const GLuint pixelBytes = layout->pixelBits / 8;
// host texture requires sync
// create surface descriptors for image transfer
rglGcmSurface src = {
source: RGLGCM_SURFACE_SOURCE_TEMPORARY,
width: 0, // replaced per image
height: 0, // replaced per image
bpp: pixelBytes,
pitch: 0, // replaced per image
format: layout->internalFormat,
pool: RGLGCM_SURFACE_POOL_LINEAR, // via bounce buffer
ppuData: NULL, // replaced per image
dataId: GMM_ERROR,
dataIdOffset:0,
};
rglGcmSurface dst = {
source: RGLGCM_SURFACE_SOURCE_TEXTURE,
width: 0, // replaced per image
height: 0, // replaced per image
bpp: pixelBytes,
pitch: layout->pitch,
format: layout->internalFormat,
pool: gcmTexture->pool,
ppuData: NULL, // replaced per image
dataId: GMM_ERROR,
dataIdOffset:0,
};
// use a bounce buffer to transfer to GPU
GLuint bounceBufferId = GMM_ERROR;
{
{
// check if upload is needed for this image
rglImage *image = texture->image;
if ( image->dataState == RGL_IMAGE_DATASTATE_HOST )
{
// determine image offset from base address
// TODO: compute all offsets at once for efficiency
// This is the offset in bytes for this face/image from the
// texture base address.
const GLuint dataOffset = rglGetGcmImageOffset( layout, 0, 0 );
// set source pixel buffer
src.ppuData = image->data;
// lazy allocation of bounce buffer
if ( bounceBufferId == GMM_ERROR && layout->baseDepth == 1 )
bounceBufferId = gmmAlloc((CellGcmContextData*)&rglGcmState_i.fifo,
CELL_GCM_LOCATION_LOCAL, 0, gcmTexture->gpuSize);
if ( bounceBufferId != GMM_ERROR )
{
// copy image to bounce buffer
src.dataId = bounceBufferId;
src.dataIdOffset = dataOffset;
// NPOT DXT
memcpy( gmmIdToAddress( src.dataId ) + dataOffset,
image->data,
image->storageSize );
}
{
// use surface copy functions
src.width = image->width;
src.height = image->height;
src.pitch = pixelBytes * src.width;
dst.width = src.width;
dst.height = image->height;
dst.dataId = gcmTexture->gpuAddressId;
dst.dataIdOffset = gcmTexture->gpuAddressIdOffset + dataOffset;
GLuint offsetHeight = 0;
if ( dst.pitch )
{
// linear (not swizzled)
// The tiled linear format requires that render
// targets be aligned to 8*pitch from the start of
// the tiled region.
offsetHeight = ( dataOffset / dst.pitch ) % 8;
dst.height += offsetHeight;
dst.dataIdOffset -= offsetHeight * dst.pitch;
}
rglGcmCopySurface(
&src, 0, 0,
&dst, 0, offsetHeight,
src.width, src.height,
GL_TRUE ); // don't bypass GPU pipeline
}
// free CPU copy of data
rglImageFreeCPUStorage( image );
image->dataState |= RGL_IMAGE_DATASTATE_GPU;
} // newer data on host
} // single level (level 0)
} // single face (face 0)
if ( bounceBufferId != GMM_ERROR )
gmmFree( bounceBufferId );
rglGcmFifoGlInvalidateTextureCache();
}
// map RGL internal types to GCM
static inline void rglGcmUpdateGcmTexture( rglTexture * texture, rglGcmTextureLayout * layout, rglGcmTexture * platformTexture )
{
// use color format for depth with no compare mode
// This hack is needed because the hardware will not read depth
// textures without performing a compare. The depth value will need to
// be reconstructed in the shader from the color components.
GLuint internalFormat = layout->internalFormat;
// set the format and remap( control 1)
rglGcmMapTextureFormat( internalFormat,
platformTexture->gcmTexture.format, platformTexture->gcmTexture.remap );
// This is just to cover the conversion from swizzled to linear
if ( layout->pitch )
{
platformTexture->gcmTexture.format += 0x20; // see class doc definitions for SZ_NR vs LN_NR...
}
platformTexture->gcmTexture.width = layout->baseWidth;
platformTexture->gcmTexture.height = layout->baseHeight;
platformTexture->gcmTexture.depth = layout->baseDepth;
platformTexture->gcmTexture.pitch = layout->pitch;
platformTexture->gcmTexture.mipmap = layout->levels;
// set the dimension and cubemap settings
// default is false
platformTexture->gcmTexture.cubemap = CELL_GCM_FALSE;
// set dimension, swizzled implies P2 width/height/depth
switch ( texture->target )
{
case 0:
case RGLGCM_TEXTURE_2D:
platformTexture->gcmTexture.dimension = CELL_GCM_TEXTURE_DIMENSION_2;
break;
}
// system or local texture
platformTexture->gcmTexture.location = CELL_GCM_LOCATION_LOCAL;
}
// map RGL internal types to GCM
void rglGcmUpdateMethods( rglTexture * texture )
{
rglGcmTexture *platformTexture = ( rglGcmTexture * )texture->platformTexture;
rglGcmTextureLayout *layout = &platformTexture->gpuLayout;
// max aniso
int maxAniso = ( int )texture->maxAnisotropy;
GLuint minFilter = texture->minFilter;
GLuint magFilter = texture->magFilter;
// XXX make sure that REVALIDATE_PARAMETERS is set if the format of the texture changes
// revalidate the texture registers cache just to ensure we are in the correct filtering mode
// based on the internal format.
switch ( layout->internalFormat )
{
case RGLGCM_FLOAT_R32:
// float 32 doesn't support filtering
minFilter = unFilter( minFilter );
magFilter = unFilter( magFilter );
maxAniso = 1;
break;
default:
break;
}
// -----------------------------------------------------------------------
// map the SET_TEXTURE_FILTER method.
// set the min
platformTexture->gcmMethods.filter.min = rglGcmMapMinTextureFilter( minFilter );
// set the mag
platformTexture->gcmMethods.filter.mag = rglGcmMapMagTextureFilter( magFilter );
// set Quincunx convolution by default
platformTexture->gcmMethods.filter.conv = CELL_GCM_TEXTURE_CONVOLUTION_QUINCUNX;
// We don't actually expose this, but still need to set it up properly in case we expose it later
// hw expects a 5.8 twos-complement fixed-point // XXX what is the - .26f ?
platformTexture->gcmMethods.filter.bias = ( GLint )(( texture->lodBias - .26f ) * 256.0f );
// -----------------------------------------------------------------------
// set the SET_TEXTURE_CONTROL0 params
platformTexture->gcmMethods.control0.maxAniso = rglGcmMapAniso( maxAniso );
const GLfloat minLOD = MAX( texture->minLod, texture->baseLevel );
const GLfloat maxLOD = MIN( texture->maxLod, texture->maxLevel );
platformTexture->gcmMethods.control0.minLOD = ( GLuint )( MAX( minLOD, 0 ) * 256.0f );
platformTexture->gcmMethods.control0.maxLOD = ( GLuint )( MIN( maxLOD, layout->levels ) * 256.0f );
// -----------------------------------------------------------------------
// set the SET_TEXTURE_ADDRESS method params.
platformTexture->gcmMethods.address.wrapS = rglGcmMapWrapMode( texture->wrapS );
platformTexture->gcmMethods.address.wrapT = rglGcmMapWrapMode( texture->wrapT );
platformTexture->gcmMethods.address.wrapR = rglGcmMapWrapMode( texture->wrapR );
platformTexture->gcmMethods.address.unsignedRemap = CELL_GCM_TEXTURE_UNSIGNED_REMAP_NORMAL;
// now for gamma remap
GLuint gamma = 0;
GLuint remap = texture->gammaRemap;
gamma |= ( remap & RGLGCM_GAMMA_REMAP_RED_BIT ) ? CELL_GCM_TEXTURE_GAMMA_R : 0;
gamma |= ( remap & RGLGCM_GAMMA_REMAP_GREEN_BIT ) ? CELL_GCM_TEXTURE_GAMMA_G : 0;
gamma |= ( remap & RGLGCM_GAMMA_REMAP_BLUE_BIT ) ? CELL_GCM_TEXTURE_GAMMA_B : 0;
gamma |= ( remap & RGLGCM_GAMMA_REMAP_ALPHA_BIT ) ? CELL_GCM_TEXTURE_GAMMA_A : 0;
platformTexture->gcmMethods.address.gamma = gamma;
// set border colors
RGLGCM_CALC_COLOR_LE_ARGB8(&(platformTexture->gcmMethods.borderColor),
texture->borderColor.R, texture->borderColor.G, texture->borderColor.B, texture->borderColor.A);
// -----------------------------------------------------------------------
// setup the GcmTexture
// format, control1, control3, imagerect; setup for cellGcmSetTexture later
rglGcmUpdateGcmTexture( texture, layout, platformTexture );
}
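// Illustrative fixed-point math for the methods above: with lodBias == 0.0f
// the 5.8 bias field is ( GLint )(( 0.0f - .26f ) * 256.0f ) == -66, and a
// clamped maxLOD of 7.0 becomes 7 * 256 == 1792.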
// Validate texture resources
void rglPlatformValidateTextureResources( rglTexture *texture )
{
texture->isComplete = GL_TRUE;
// We may need to reallocate the texture when the parameters are changed
// from non-mipmap to mipmap filtering, even though the images have not
// changed.
//
// NOTE: If we ever support accessing mipmaps from a PBO, this code
// must be changed. As it is, if the texture has a shared PBO and the
// mipmap flag then the slow path (copy back to host) is invoked.
if ( texture->revalidate & RGL_TEXTURE_REVALIDATE_IMAGES || texture->revalidate & RGL_TEXTURE_REVALIDATE_LAYOUT )
{
// upload images
rglPlatformReallocateGcmTexture( texture );
rglPlatformUploadTexture( texture );
}
// gcmTexture method command
rglGcmUpdateMethods( texture );
texture->revalidate = 0;
}
// [YLIN] We are going to use gcm macro directly!
#include <cell/gcm/gcm_method_data.h>
// Set a texture to a gcm texture unit
static inline void rglGcmSetTextureUnit( GLuint unit, rglGcmTexture *platformTexture )
{
const GLuint imageOffset = gmmIdToOffset(platformTexture->gpuAddressId) + platformTexture->gpuAddressIdOffset;
platformTexture->gcmTexture.offset = imageOffset;
// set up the texture unit with the info for the current texture
// bind texture , control 1,3,format and remap
// [YLIN] Contiguous GCM calls will in fact cause LHSs (load-hit-stores) between them.
// There is no workaround for this other than not using the GCM functions.
GCM_FUNC_SAFE( cellGcmSetTexture, unit, &platformTexture->gcmTexture );
CellGcmContextData *gcm_context = (CellGcmContextData*)&rglGcmState_i.fifo;
cellGcmReserveMethodSizeInline(gcm_context, 11);
uint32_t *current = gcm_context->current;
current[0] = CELL_GCM_METHOD_HEADER_TEXTURE_OFFSET(unit, 8);
current[1] = CELL_GCM_METHOD_DATA_TEXTURE_OFFSET(platformTexture->gcmTexture.offset);
current[2] = CELL_GCM_METHOD_DATA_TEXTURE_FORMAT(platformTexture->gcmTexture.location,
platformTexture->gcmTexture.cubemap,
platformTexture->gcmTexture.dimension,
platformTexture->gcmTexture.format,
platformTexture->gcmTexture.mipmap);
current[3] = CELL_GCM_METHOD_DATA_TEXTURE_ADDRESS( platformTexture->gcmMethods.address.wrapS,
platformTexture->gcmMethods.address.wrapT,
platformTexture->gcmMethods.address.wrapR,
platformTexture->gcmMethods.address.unsignedRemap,
platformTexture->gcmMethods.address.zfunc,
platformTexture->gcmMethods.address.gamma,
0);
current[4] = CELL_GCM_METHOD_DATA_TEXTURE_CONTROL0(CELL_GCM_TRUE,
platformTexture->gcmMethods.control0.minLOD,
platformTexture->gcmMethods.control0.maxLOD,
platformTexture->gcmMethods.control0.maxAniso);
current[5] = platformTexture->gcmTexture.remap;
current[6] = CELL_GCM_METHOD_DATA_TEXTURE_FILTER(
(platformTexture->gcmMethods.filter.bias & 0x1fff),
platformTexture->gcmMethods.filter.min,
platformTexture->gcmMethods.filter.mag,
platformTexture->gcmMethods.filter.conv);
current[7] = CELL_GCM_METHOD_DATA_TEXTURE_IMAGE_RECT(
platformTexture->gcmTexture.height,
platformTexture->gcmTexture.width);
current[8] = CELL_GCM_METHOD_DATA_TEXTURE_BORDER_COLOR(
platformTexture->gcmMethods.borderColor);
current[9] = CELL_GCM_METHOD_HEADER_TEXTURE_CONTROL3(unit,1);
current[10] = CELL_GCM_METHOD_DATA_TEXTURE_CONTROL3(
platformTexture->gcmTexture.pitch,
platformTexture->gcmTexture.depth);
gcm_context->current = &current[11];
}
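// The hand-rolled push above reserves exactly 11 words: one header plus 8
// data words (offset, format, address, control0, remap, filter, image rect,
// border color), then a CONTROL3 header and its data word, matching the
// cellGcmReserveMethodSizeInline( gcm_context, 11 ) call.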
// Validate incomplete texture by remapping
inline static void rglPlatformValidateIncompleteTexture( GLuint unit )
{
GLuint remap = CELL_GCM_REMAP_MODE(
CELL_GCM_TEXTURE_REMAP_ORDER_XYXY,
CELL_GCM_TEXTURE_REMAP_FROM_A,
CELL_GCM_TEXTURE_REMAP_FROM_R,
CELL_GCM_TEXTURE_REMAP_FROM_G,
CELL_GCM_TEXTURE_REMAP_FROM_B,
CELL_GCM_TEXTURE_REMAP_ONE,
CELL_GCM_TEXTURE_REMAP_ZERO,
CELL_GCM_TEXTURE_REMAP_ZERO,
CELL_GCM_TEXTURE_REMAP_ZERO );
// disable control 0
GCM_FUNC( cellGcmSetTextureControl, unit, CELL_GCM_FALSE, 0, 0, 0 );
// set texture remap only
GCM_FUNC( cellGcmSetTextureRemap, unit, remap );
}
#undef RGLGCM_REMAP_MODES
// Validate resources of a texture and set it on a unit
void rglPlatformValidateTextureStage( int unit, rglTexture* texture )
{
if ( RGL_UNLIKELY( texture->revalidate ) )
{
// this updates the isComplete bit.
rglPlatformValidateTextureResources( texture );
}
GLboolean isCompleteCache = texture->isComplete;
if ( RGL_LIKELY( isCompleteCache ) )
{
rglGcmTexture *platformTexture = ( rglGcmTexture * )texture->platformTexture;
rglGcmSetTextureUnit( unit, platformTexture );
}
else
{
//RGL_REPORT_EXTRA( RGL_REPORT_TEXTURE_INCOMPLETE, "Texture %d bound to unit %d(%s) is incomplete.", texture->name, unit, rglGetGLEnumName( texture->target ) );
rglPlatformValidateIncompleteTexture( unit );
}
}
// Choose internal format closest to given format
GLenum rglPlatformChooseInternalFormat( GLenum internalFormat )
{
switch ( internalFormat )
{
case GL_ALPHA12:
case GL_ALPHA16:
return RGLGCM_ALPHA16;
case GL_ALPHA:
case GL_ALPHA4:
return RGLGCM_ALPHA8;
case GL_R3_G3_B2:
case GL_RGB4:
case GL_RGB:
case GL_RGB8:
case RGLGCM_RGBX8:
return RGLGCM_RGBX8;
case GL_RGBA2:
case GL_RGBA4:
case GL_RGBA8:
case GL_RGBA:
return RGLGCM_RGBA8;
case GL_RGB5_A1:
return RGLGCM_RGB5_A1_SCE;
case GL_RGB5:
return RGLGCM_RGB565_SCE;
case GL_BGRA:
case RGLGCM_BGRA8:
return RGLGCM_BGRA8;
case GL_ARGB_SCE:
return RGLGCM_ARGB8;
case GL_RGBA32F_ARB:
default:
return GL_INVALID_ENUM;
}
return GL_INVALID_ENUM;
}
// Expand internal format to format and type
void rglPlatformExpandInternalFormat( GLenum internalFormat, GLenum *format, GLenum *type )
{
switch ( internalFormat )
{
case RGLGCM_ALPHA16:
*format = GL_ALPHA;
*type = GL_UNSIGNED_SHORT;
break;
case RGLGCM_ALPHA8:
*format = GL_ALPHA;
*type = GL_UNSIGNED_BYTE;
break;
case RGLGCM_RGBX8:
*format = GL_RGBA;
*type = GL_UNSIGNED_INT_8_8_8_8;
break;
case RGLGCM_RGBA8:
*format = GL_RGBA;
*type = GL_UNSIGNED_INT_8_8_8_8;
break;
case RGLGCM_ARGB8:
*format = GL_BGRA;
*type = GL_UNSIGNED_INT_8_8_8_8_REV;
break;
case RGLGCM_BGRA8:
*format = GL_BGRA;
*type = GL_UNSIGNED_INT_8_8_8_8;
break;
case RGLGCM_RGB5_A1_SCE:
*format = GL_RGBA;
*type = GL_UNSIGNED_SHORT_1_5_5_5_REV;
break;
case RGLGCM_RGB565_SCE:
*format = GL_RGB;
*type = GL_UNSIGNED_SHORT_5_6_5_REV;
break;
default:
return;
}
}
// Choose internal storage type and size, and set it to image, based on given format
GLenum rglPlatformChooseInternalStorage( rglImage* image, GLenum internalFormat )
{
// see note at bottom concerning storageSize
image->storageSize = 0;
GLenum platformInternalFormat = rglPlatformChooseInternalFormat( internalFormat );
if ( platformInternalFormat == GL_INVALID_ENUM ) return GL_INVALID_ENUM;
image->internalFormat = platformInternalFormat;
rglPlatformExpandInternalFormat( platformInternalFormat, &image->format, &image->type );
// Note that it is critical to get the storageSize value correct because
// this member is used to configure texture loads and unloads. If this
// value is wrong (e.g. contains unnecessary padding) it will corrupt
// the GPU memory layout.
image->storageSize = rglGetStorageSize(
image->format, image->type,
image->width, image->height, image->depth );
return GL_NO_ERROR;
}
// Translate platform-specific format to GL enum
GLenum rglPlatformTranslateTextureFormat( GLenum internalFormat )
{
switch ( internalFormat )
{
case RGLGCM_RGBX8:
return GL_RGBA8;
default: // same as GL
return internalFormat;
}
}
// Implementation of texture reference
// Associate bufferObject to texture by assigning buffer's gpu address to the gcm texture
GLboolean rglPlatformTextureReference( rglTexture *texture, GLuint pitch, rglBufferObject *bufferObject, GLintptr offset )
{
rglGcmTexture *gcmTexture = ( rglGcmTexture * )texture->platformTexture;
// XXX check pitch restrictions ?
rglGcmTextureLayout newLayout;
rglPlatformChooseGPUFormatAndLayout( texture, true, pitch, &newLayout );
GLboolean isRenderTarget = GL_FALSE;
GLboolean vertexEnable = GL_FALSE;
// can usually be a render target, except for restrictions below
if ( rglIsDrawableColorFormat( newLayout.internalFormat ))
{
isRenderTarget = GL_TRUE;
}
switch ( newLayout.internalFormat )
{
case GL_FLOAT_RGBA32:
case GL_RGBA32F_ARB:
vertexEnable = GL_TRUE;
break;
default:
break;
}
texture->isRenderTarget = isRenderTarget;
texture->vertexEnable = vertexEnable;
if ( gcmTexture->gpuAddressId != GMM_ERROR )
rglPlatformDestroyTexture( texture );
rglGcmBufferObject *gcmBuffer = ( rglGcmBufferObject * )bufferObject->platformBufferObject;
gcmTexture->gpuLayout = newLayout;
gcmTexture->pool = gcmBuffer->pool;
gcmTexture->gpuAddressId = gcmBuffer->bufferId;
gcmTexture->gpuAddressIdOffset = offset;
gcmTexture->gpuSize = rglGetGcmTextureSize( &newLayout );
texture->revalidate &= ~( RGL_TEXTURE_REVALIDATE_LAYOUT | RGL_TEXTURE_REVALIDATE_IMAGES );
texture->revalidate |= RGL_TEXTURE_REVALIDATE_PARAMETERS;
rglTextureTouchFBOs( texture );
return GL_TRUE;
}
// GlSetRenderTarget implementation starts here
// Render target rt's color and depth buffer parameters are updated with args
// Fifo functions are called as required
static inline void rglGcmSetColorDepthBuffers( rglGcmRenderTarget *rt, rglGcmRenderTargetEx const * const args )
{
CellGcmSurface * grt = &rt->gcmRenderTarget;
rt->colorBufferCount = args->colorBufferCount;
// remember rt for swap and clip related functions
GLuint oldHeight;
GLuint oldyInverted;
oldyInverted = rt->yInverted;
oldHeight = rt->gcmRenderTarget.height;
if ( rt->colorFormat != ( GLuint )args->colorFormat )
{
// ARGB8 and FP16 interpret some registers differently
rglGcmBlendState *blend = &rglGcmState_i.state.blend;
rt->colorFormat = args->colorFormat;
rglGcmFifoGlBlendColor( blend->r, blend->g, blend->b, blend->a );
}
GLuint i = 0;
for ( i = 0; i < args->colorBufferCount; i++ )
{
// a zero pitch marks an unused color buffer; give it a dummy surface
if ( args->colorPitch[i] == 0 )
{
grt->colorOffset[i] = 0;
grt->colorPitch[i] = 0x200;
grt->colorLocation[i] = CELL_GCM_LOCATION_LOCAL;
}
else
{
if ( args->colorId[i] != GMM_ERROR )
{
grt->colorLocation[i] = CELL_GCM_LOCATION_LOCAL;
grt->colorOffset[i] = gmmIdToOffset(args->colorId[i]) + args->colorIdOffset[i];
grt->colorPitch[i] = args->colorPitch[i];
}
}
}
// fill in the other render targets that haven't been set
for ( ; i < RGLGCM_SETRENDERTARGET_MAXCOUNT; i++ )
{
grt->colorOffset[i] = grt->colorOffset[0];
grt->colorPitch[i] = grt->colorPitch[0];
grt->colorLocation[i] = grt->colorLocation[0];
}
rt->yInverted = args->yInverted;
grt->x = args->xOffset;
grt->y = args->yOffset;
grt->width = args->width;
grt->height = args->height;
// if the height or the yInverted flag changed, reapply the viewport so yInverted is honored
if (( grt->height != oldHeight ) | ( rt->yInverted != oldyInverted ) )
{
rglGcmViewportState *v = &rglGcmState_i.state.viewport;
rglGcmFifoGlViewport( v->x, v->y, v->w, v->h );
}
}
// Update rt's color and depth format with args
static inline void rglGcmSetColorDepthFormats( rglGcmRenderTarget *rt, rglGcmRenderTargetEx const * const args )
{
CellGcmSurface * grt = &rt->gcmRenderTarget;
// set the color format
switch ( args->colorFormat )
{
case RGLGCM_NONE:
case RGLGCM_ARGB8:
// choose a fake format
// but choose a 16-bit format if depth is 16-bit
grt->colorFormat = CELL_GCM_SURFACE_A8R8G8B8;
break;
case RGLGCM_FLOAT_R32:
grt->colorFormat = CELL_GCM_SURFACE_F_X32;
break;
default:
break;
}
// set the depth format
// choose a fake format
grt->depthFormat = CELL_GCM_SURFACE_Z24S8;
grt->depthLocation = CELL_GCM_LOCATION_LOCAL;
grt->depthOffset = 0;
grt->depthPitch = 64;
}
// Update rt's color targets
static inline void rglGcmSetTarget( rglGcmRenderTarget *rt, rglGcmRenderTargetEx const * const args )
{
CellGcmSurface * grt = &rt->gcmRenderTarget;
// set target combo
switch ( rt->colorBufferCount )
{
case 0:
grt->colorTarget = CELL_GCM_SURFACE_TARGET_NONE;
break;
case 1:
grt->colorTarget = CELL_GCM_SURFACE_TARGET_1;
break;
case 2:
grt->colorTarget = CELL_GCM_SURFACE_TARGET_MRT1;
break;
case 3:
grt->colorTarget = CELL_GCM_SURFACE_TARGET_MRT2;
break;
case 4:
grt->colorTarget = CELL_GCM_SURFACE_TARGET_MRT3;
break;
}
}
// Set current render target to args
void rglGcmFifoGlSetRenderTarget( rglGcmRenderTargetEx const * const args )
{
rglGcmRenderTarget *rt = &rglGcmState_i.renderTarget;
CellGcmSurface * grt = &rglGcmState_i.renderTarget.gcmRenderTarget;
rglGcmSetColorDepthBuffers( rt, args );
rglGcmSetColorDepthFormats( rt, args );
// Update rt's AA and Swizzling parameters with args
GCM_FUNC( cellGcmSetAntiAliasingControl,
CELL_GCM_FALSE,
CELL_GCM_FALSE,
CELL_GCM_FALSE,
0xFFFF);
grt->type = CELL_GCM_SURFACE_PITCH;
rglGcmSetTarget( rt, args );
// ensure if either width or height is 1 the other is one as well
if ( grt->width == 1 )
grt->height = 1;
else
if ( grt->height == 1 )
grt->width = 1;
GCM_FUNC( cellGcmSetSurface, grt );
}
/*============================================================
PLATFORM TNL
============================================================ */
void rglValidateVertexProgram()
{
// if validation is required, it means the program has to be downloaded.
RGLcontext* LContext = _CurrentContext;
rglSetNativeCgVertexProgram(LContext->BoundVertexProgram);
// Set all uniforms.
if(!(LContext->needValidate & RGL_VALIDATE_VERTEX_CONSTANTS) && LContext->BoundVertexProgram->parentContext)
rglValidateVertexConstants();
}
void rglValidateVertexConstants()
{
RGLcontext* LContext = _CurrentContext;
rglGcmPushProgramPushBuffer( LContext->BoundVertexProgram );
}
/*============================================================
UTILS
============================================================ */
void rglDrawUtilQuad( GLboolean useFixedVP, GLboolean useFixedFP, GLuint x, GLuint y, GLuint width, GLuint height )
{
RGLcontext* LContext = _CurrentContext;
rglGcmFifoGlDisable( RGLGCM_BLEND );
GCM_FUNC( cellGcmSetFrontPolygonMode, CELL_GCM_POLYGON_MODE_FILL );
GCM_FUNC( cellGcmSetBackPolygonMode, CELL_GCM_POLYGON_MODE_FILL );
GCM_FUNC( cellGcmSetUserClipPlaneControl,
CELL_GCM_USER_CLIP_PLANE_DISABLE,
CELL_GCM_USER_CLIP_PLANE_DISABLE,
CELL_GCM_USER_CLIP_PLANE_DISABLE,
CELL_GCM_USER_CLIP_PLANE_DISABLE,
CELL_GCM_USER_CLIP_PLANE_DISABLE,
CELL_GCM_USER_CLIP_PLANE_DISABLE );
rglGcmFifoGlViewport( x, y, width, height, 0.0f, 1.0f );
GCM_FUNC_NO_ARGS( cellGcmSetInvalidateVertexCache );
rglGcmFifoGlDrawArrays( RGLGCM_TRIANGLE_STRIP, 0, 4 );
LContext->needValidate |= RGL_VALIDATE_BLENDING | RGL_VALIDATE_VIEWPORT;
}