#include <stdint.h>
#include <stdarg.h>
#include <stddef.h> /* ptrdiff_t on osx */
#include <stdlib.h>
#include <string.h>

#include <retro_assert.h>
#include <retro_inline.h>
#include <boolean.h>

#include <formats/image.h>
enum
{
   STBI_default    = 0, /* only used for req_comp */
   STBI_grey       = 1,
   STBI_grey_alpha = 2,
   STBI_rgb        = 3,
   STBI_rgb_alpha  = 4
};

typedef struct
{
   int  (*read)(void *user, char *data, int size); /* fill 'data' with 'size' bytes. return number of bytes actually read */
   void (*skip)(void *user, int n);                /* skip the next 'n' bytes, or 'unget' the last -n bytes if negative */
   int  (*eof) (void *user);                       /* returns nonzero if we are at end of file/data */
} stbi_io_callbacks;
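/* Illustrative sketch (not part of the original file): these callbacks can be
 * backed by stdio if the caller passes a FILE* as 'user', e.g.:
 *
 *    static int  cb_read(void *user, char *data, int size) { return (int)fread(data, 1, size, (FILE*)user); }
 *    static void cb_skip(void *user, int n)                { fseek((FILE*)user, n, SEEK_CUR); }
 *    static int  cb_eof (void *user)                       { return feof((FILE*)user); }
 *
 * The decoder only touches the callbacks through stbi__context.io, so any
 * other data source can be adapted the same way.
 */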
/* should produce compiler error if size is wrong */
typedef unsigned char validate_uint32[sizeof(uint32_t) == 4 ? 1 : -1];

#ifdef _MSC_VER
#define STBI_NOTUSED(v)  (void)(v)
#else
#define STBI_NOTUSED(v)  (void)sizeof(v)
#endif

#ifdef _MSC_VER
#define STBI_HAS_LROTL
#endif

#ifdef STBI_HAS_LROTL
#define stbi_lrot(x,y)  _lrotl(x,y)
#else
#define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (32 - (y))))
#endif

// x86/x64 detection
#if defined(__x86_64__) || defined(_M_X64)
#define STBI__X64_TARGET
#elif defined(__i386) || defined(_M_IX86)
#define STBI__X86_TARGET
#endif
#if defined(__GNUC__) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET)) && !defined(__SSE2__) && !defined(STBI_NO_SIMD)
/* NOTE: not clear do we actually need this for the 64-bit path?
 * gcc doesn't support sse2 intrinsics unless you compile with -msse2,
 * (but compiling with -msse2 allows the compiler to use SSE2 everywhere;
 * this is just broken and gcc are jerks for not fixing it properly
 * http://www.virtualdub.org/blog/pivot/entry.php?id=363 )
 */
#define STBI_NO_SIMD
#endif

#if defined(__MINGW32__) && defined(STBI__X86_TARGET) && !defined(STBI_MINGW_ENABLE_SSE2) && !defined(STBI_NO_SIMD)
/* Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid STBI__X64_TARGET
 *
 * 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
 * Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
 * As a result, enabling SSE2 on 32-bit MinGW is dangerous when not
 * simultaneously enabling "-mstackrealign".
 *
 * See https://github.com/nothings/stb/issues/81 for more information.
 *
 * So default to no SSE2 on 32-bit MinGW. If you've read this far and added
 * -mstackrealign to your build settings, feel free to #define STBI_MINGW_ENABLE_SSE2.
 */
#define STBI_NO_SIMD
#endif

#if !defined(STBI_NO_SIMD) && defined(STBI__X86_TARGET)
#define STBI_SSE2
#include <emmintrin.h>

#ifdef _MSC_VER
#if _MSC_VER >= 1400  /* not VC6 */
#include <intrin.h> /* __cpuid */
static int stbi__cpuid3(void)
{
   int info[4];
   __cpuid(info, 1);
   return info[3];
}
#else
static int stbi__cpuid3(void)
{
   int res;
   __asm {
      mov  eax,1
      cpuid
      mov  res,edx
   }
   return res;
}
#endif

#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name

static int stbi__sse2_available(void)
{
   int info3 = stbi__cpuid3();
   return ((info3 >> 26) & 1) != 0;
}
#else /* assume GCC-style if not VC++ */
#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))

static int stbi__sse2_available(void)
{
#if defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 408 /* GCC 4.8 or later */
   /* GCC 4.8+ has a nice way to do this */
   return __builtin_cpu_supports("sse2");
#else
   /* portable way to do this, preferably without using GCC inline ASM?
    * just bail for now. */
   return 0;
#endif
}
#endif
#endif

/* ARM NEON */
#if defined(STBI_NO_SIMD) && defined(STBI_NEON)
#undef STBI_NEON
#endif

#ifdef STBI_NEON
#include <arm_neon.h>
/* assume GCC or Clang on ARM targets */
#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
#endif

#ifndef STBI_SIMD_ALIGN
#define STBI_SIMD_ALIGN(type, name) type name
#endif
///////////////////////////////////////////////
//
// stbi__context struct and start_xxx functions
//
// stbi__context structure is our basic context used by all images, so it
// contains all the IO context, plus some basic image information
typedef struct
{
   uint32_t img_x, img_y;
   int img_n, img_out_n;

   stbi_io_callbacks io;
   void *io_user_data;

   int read_from_callbacks;
   int buflen;
   uint8_t buffer_start[128];

   uint8_t *img_buffer, *img_buffer_end;
   uint8_t *img_buffer_original;
} stbi__context;

static void stbi__refill_buffer(stbi__context *s);

// initialize a memory-decode context
static void stbi__start_mem(stbi__context *s, const uint8_t *buffer, int len)
{
   s->io.read = NULL;
   s->read_from_callbacks = 0;
   s->img_buffer = s->img_buffer_original = (uint8_t *)buffer;
   s->img_buffer_end = (uint8_t *)buffer + len;
}

static void stbi__rewind(stbi__context *s)
{
   /* conceptually rewind SHOULD rewind to the beginning of the stream,
    * but we just rewind to the beginning of the initial buffer, because
    * we only use it after doing 'test', which only ever looks at at most 92 bytes
    */
   s->img_buffer = s->img_buffer_original;
}

#ifndef STBI_NO_JPEG
static int      stbi__jpeg_test(stbi__context *s);
static uint8_t *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
#endif
// this is not threadsafe
static const char *stbi__g_failure_reason;

static int stbi__err(const char *str)
{
   stbi__g_failure_reason = str;
   return 0;
}

// stbi__err - error
// stbi__errpf - error returning pointer to float
// stbi__errpuc - error returning pointer to unsigned char
#ifdef STBI_NO_FAILURE_STRINGS
   #define stbi__err(x,y)  0
#elif defined(STBI_FAILURE_USERMSG)
   #define stbi__err(x,y)  stbi__err(y)
#else
   #define stbi__err(x,y)  stbi__err(x)
#endif

#define stbi__errpf(x,y)   ((float *)(stbi__err(x,y)?NULL:NULL))
#define stbi__errpuc(x,y)  ((unsigned char *)(stbi__err(x,y)?NULL:NULL))
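/* Note that stbi__errpf()/stbi__errpuc() always evaluate to NULL; they exist
 * so a failing loader can record the failure reason (via stbi__err) and
 * return a typed null pointer in a single expression. */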
static int stbi__vertically_flip_on_load = 0;

static unsigned char *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
{
#ifndef STBI_NO_JPEG
   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s, x, y, comp, req_comp);
#endif

   return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt");
}

static unsigned char *stbi__load_flip(stbi__context *s, int *x, int *y, int *comp, int req_comp)
{
   unsigned char *result = stbi__load_main(s, x, y, comp, req_comp);

   if (stbi__vertically_flip_on_load && result != NULL) {
      int w = *x, h = *y;
      int depth = req_comp ? req_comp : *comp;
      int row, col, z;
      uint8_t temp;

      // @OPTIMIZE: use a bigger temp buffer and memcpy multiple pixels at once
      for (row = 0; row < (h >> 1); row++) {
         for (col = 0; col < w; col++) {
            for (z = 0; z < depth; z++) {
               temp = result[(row * w + col) * depth + z];
               result[(row * w + col) * depth + z] = result[((h - row - 1) * w + col) * depth + z];
               result[((h - row - 1) * w + col) * depth + z] = temp;
            }
         }
      }
   }

   return result;
}

static uint8_t *stbi_load_from_memory(const uint8_t *buffer, int len, int *x, int *y, int *comp, int req_comp)
{
   stbi__context s;
   stbi__start_mem(&s, buffer, len);
   return stbi__load_flip(&s, x, y, comp, req_comp);
}
//////////////////////////////////////////////////////////////////////////////
//
// Common code used by all image loaders
//

enum
{
   STBI__SCAN_load = 0,
   STBI__SCAN_type,
   STBI__SCAN_header
};

static void stbi__refill_buffer(stbi__context *s)
{
   int n = (s->io.read)(s->io_user_data, (char *)s->buffer_start, s->buflen);
   if (n == 0) {
      // at end of file, treat same as if from memory, but need to handle case
      // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
      s->read_from_callbacks = 0;
      s->img_buffer = s->buffer_start;
      s->img_buffer_end = s->buffer_start + 1;
      *s->img_buffer = 0;
   } else {
      s->img_buffer = s->buffer_start;
      s->img_buffer_end = s->buffer_start + n;
   }
}
static INLINE uint8_t stbi__get8(stbi__context *s)
{
   if (s->img_buffer < s->img_buffer_end)
      return *s->img_buffer++;
   if (s->read_from_callbacks) {
      stbi__refill_buffer(s);
      return *s->img_buffer++;
   }
   return 0;
}

static INLINE int stbi__at_eof(stbi__context *s)
{
   if (s->io.read) {
      if (!(s->io.eof)(s->io_user_data)) return 0;
      // if feof() is true, check if buffer = end
      // special case: we've only got the special 0 character at the end
      if (s->read_from_callbacks == 0) return 1;
   }

   return s->img_buffer >= s->img_buffer_end;
}

static void stbi__skip(stbi__context *s, int n)
{
   if (n < 0) {
      s->img_buffer = s->img_buffer_end;
      return;
   }
   if (s->io.read) {
      int blen = (int)(s->img_buffer_end - s->img_buffer);
      if (blen < n) {
         s->img_buffer = s->img_buffer_end;
         (s->io.skip)(s->io_user_data, n - blen);
         return;
      }
   }
   s->img_buffer += n;
}

static int stbi__get16be(stbi__context *s)
{
   int z = stbi__get8(s);
   return (z << 8) + stbi__get8(s);
}

#define STBI__BYTECAST(x)  ((uint8_t)((x) & 255))  // truncate int to byte without warnings
//////////////////////////////////////////////////////////////////////////////
//
//  "baseline" JPEG/JFIF decoder
//
//    simple implementation
//      - doesn't support delayed output of y-dimension
//      - simple interface (only one output format: 8-bit interleaved RGB)
//      - doesn't try to recover corrupt jpegs
//      - doesn't allow partial loading, loading multiple at once
//      - still fast on x86 (copying globals into locals doesn't help x86)
//      - allocates lots of intermediate memory (full size of all components)
//        - non-interleaved case requires this anyway
//        - allows good upsampling (see next)
//    high-quality
//      - upsampled channels are bilinearly interpolated, even across blocks
//      - quality integer IDCT derived from IJG's 'slow'
//    performance
//      - fast huffman; reasonable integer IDCT
//      - some SIMD kernels for common paths on targets with SSE2/NEON
//      - uses a lot of intermediate memory, could cache poorly

#ifndef STBI_NO_JPEG

// huffman decoding acceleration
#define FAST_BITS   9  // larger handles more cases; smaller stomps less cache

typedef struct
{
   uint8_t  fast[1 << FAST_BITS];
   // weirdly, repacking this into AoS is a 10% speed loss, instead of a win
   uint16_t code[256];
   uint8_t  values[256];
   uint8_t  size[257];
   unsigned int maxcode[18];
   int      delta[17];   // old 'firstsymbol' - old 'firstcode'
} stbi__huffman;
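/* During decoding (stbi__jpeg_huff_decode), fast[] is indexed by the next
 * FAST_BITS bits of the stream and yields the symbol index directly for any
 * code no longer than FAST_BITS (255 marks "not accelerated"); longer codes
 * fall back to the canonical walk over maxcode[]/delta[] by code length. */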
typedef struct
{
   stbi__context *s;
   stbi__huffman huff_dc[4];
   stbi__huffman huff_ac[4];
   uint8_t dequant[4][64];
   int16_t fast_ac[4][1 << FAST_BITS];

   // sizes for components, interleaved MCUs
   int img_h_max, img_v_max;
   int img_mcu_x, img_mcu_y;
   int img_mcu_w, img_mcu_h;

   // definition of jpeg image component
   struct
   {
      int id;
      int h, v;
      int tq;
      int hd, ha;
      int dc_pred;

      int x, y, w2, h2;
      uint8_t *data;
      void *raw_data, *raw_coeff;
      uint8_t *linebuf;
      short   *coeff;            // progressive only
      int      coeff_w, coeff_h; // number of 8x8 coefficient blocks
   } img_comp[4];

   uint32_t       code_buffer;   // jpeg entropy-coded buffer
   int            code_bits;     // number of valid bits
   unsigned char  marker;        // marker seen while filling entropy buffer
   int            nomore;        // flag if we saw a marker so must stop

   int            progressive;
   int            spec_start;
   int            spec_end;
   int            succ_high;
   int            succ_low;
   int            eob_run;

   int scan_n, order[4];
   int restart_interval, todo;

   // kernels
   void (*idct_block_kernel)(uint8_t *out, int out_stride, short data[64]);
   void (*YCbCr_to_RGB_kernel)(uint8_t *out, const uint8_t *y, const uint8_t *pcb, const uint8_t *pcr, int count, int step);
   uint8_t *(*resample_row_hv_2_kernel)(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs);
} stbi__jpeg;
static int stbi__build_huffman(stbi__huffman *h, int *count)
{
   int i, j, k = 0, code;

   // build size list for each symbol (from JPEG spec)
   for (i = 0; i < 16; ++i)
      for (j = 0; j < count[i]; ++j)
         h->size[k++] = (uint8_t)(i + 1);
   h->size[k] = 0;

   // compute actual symbols (from jpeg spec)
   code = 0;
   k = 0;
   for (j = 1; j <= 16; ++j) {
      // compute delta to add to code to compute symbol id
      h->delta[j] = k - code;
      if (h->size[k] == j) {
         while (h->size[k] == j)
            h->code[k++] = (uint16_t)(code++);
         if (code - 1 >= (1 << j)) return stbi__err("bad code lengths", "Corrupt JPEG");
      }
      // compute largest code + 1 for this size, preshifted as needed later
      h->maxcode[j] = code << (16 - j);
      code <<= 1;
   }
   h->maxcode[j] = 0xffffffff;

   // build non-spec acceleration table; 255 is flag for not-accelerated
   memset(h->fast, 255, 1 << FAST_BITS);
   for (i = 0; i < k; ++i) {
      int s = h->size[i];
      if (s <= FAST_BITS) {
         int c = h->code[i] << (FAST_BITS - s);
         int m = 1 << (FAST_BITS - s);
         for (j = 0; j < m; ++j) {
            h->fast[c + j] = (uint8_t)i;
         }
      }
   }
   return 1;
}
// build a table that decodes both magnitude and value of small ACs in
// one go.
static void stbi__build_fast_ac(int16_t *fast_ac, stbi__huffman *h)
{
   int i;
   for (i = 0; i < (1 << FAST_BITS); ++i) {
      uint8_t fast = h->fast[i];
      fast_ac[i] = 0;
      if (fast < 255) {
         int rs      = h->values[fast];
         int run     = (rs >> 4) & 15;
         int magbits = rs & 15;
         int len     = h->size[fast];

         if (magbits && len + magbits <= FAST_BITS) {
            // magnitude code followed by receive_extend code
            int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
            int m = 1 << (magbits - 1);
            if (k < m) k += (-1 << magbits) + 1;
            // if the result is small enough, we can fit it in fast_ac table
            if (k >= -128 && k <= 127)
               fast_ac[i] = (int16_t)((k << 8) + (run << 4) + (len + magbits));
         }
      }
   }
}
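/* A nonzero fast_ac entry packs three fields: bits 0..3 hold the total bit
 * count consumed (huffman length + magnitude bits), bits 4..7 the zero run,
 * and bits 8..15 the already sign-extended coefficient value, so the block
 * decoder can emit one small AC coefficient from a single table lookup
 * instead of a huffman decode plus a separate receive/extend step. */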
static void stbi__grow_buffer_unsafe(stbi__jpeg *j)
{
   do {
      int b = j->nomore ? 0 : stbi__get8(j->s);
      if (b == 0xff) {
         int c = stbi__get8(j->s);
         if (c != 0) {
            j->marker = (unsigned char)c;
            j->nomore = 1;
            return;
         }
      }
      j->code_buffer |= b << (24 - j->code_bits);
      j->code_bits   += 8;
   } while (j->code_bits <= 24);
}

// (1 << n) - 1
static uint32_t stbi__bmask[17] = {0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
// decode a jpeg huffman value from the bitstream
static INLINE int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
{
   unsigned int temp;
   int c, k;

   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);

   // look at the top FAST_BITS and determine what symbol ID it is,
   // if the code is <= FAST_BITS
   c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
   k = h->fast[c];
   if (k < 255) {
      int s = h->size[k];
      if (s > j->code_bits)
         return -1;
      j->code_buffer <<= s;
      j->code_bits   -= s;
      return h->values[k];
   }

   // naive test is to shift the code_buffer down so k bits are
   // valid, then test against maxcode. To speed this up, we've
   // preshifted maxcode left so that it has (16-k) 0s at the
   // end; in other words, regardless of the number of bits, it
   // wants to be compared against something shifted to have 16;
   // that way we don't need to shift inside the loop.
   temp = j->code_buffer >> 16;
   for (k = FAST_BITS + 1; ; ++k)
      if (temp < h->maxcode[k])
         break;
   if (k == 17) {
      // error! code not found
      j->code_bits -= 16;
      return -1;
   }

   if (k > j->code_bits)
      return -1;

   // convert the huffman code to the symbol id
   c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k];
   assert((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]);

   // convert the id to a symbol
   j->code_bits   -= k;
   j->code_buffer <<= k;
   return h->values[c];
}
// bias[n] = (-1<<n) + 1
static int const stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};

// combined JPEG 'receive' and JPEG 'extend', since baseline
// always extends everything it receives.
static INLINE int stbi__extend_receive(stbi__jpeg *j, int n)
{
   unsigned int k;
   int sgn;
   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);

   sgn = (int32_t)j->code_buffer >> 31; // sign bit is always in MSB
   k   = stbi_lrot(j->code_buffer, n);
   assert(n >= 0 && n < (int)(sizeof(stbi__bmask) / sizeof(*stbi__bmask)));
   j->code_buffer = k & ~stbi__bmask[n];
   k &= stbi__bmask[n];
   j->code_bits -= n;
   return k + (stbi__jbias[n] & ~sgn);
}
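/* Per the JPEG 'extend' step, an n-bit magnitude whose leading bit is 0
 * encodes a negative value offset by (-1 << n) + 1 (the stbi__jbias table).
 * Because the next bit to read sits in the MSB of code_buffer, its sign is
 * broadcast with an arithmetic shift and the bias is applied branch-free.
 * For example, with n = 3 the raw bits 101 decode to +5, while 011 decode
 * to 3 + (-7) = -4. */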
// get some unsigned bits
static INLINE int stbi__jpeg_get_bits(stbi__jpeg *j, int n)
{
   unsigned int k;
   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
   k = stbi_lrot(j->code_buffer, n);
   j->code_buffer = k & ~stbi__bmask[n];
   k &= stbi__bmask[n];
   j->code_bits -= n;
   return k;
}

static INLINE int stbi__jpeg_get_bit(stbi__jpeg *j)
{
   unsigned int k;
   if (j->code_bits < 1) stbi__grow_buffer_unsafe(j);
   k = j->code_buffer;
   j->code_buffer <<= 1;
   --j->code_bits;
   return k & 0x80000000;
}
// given a value that's at position X in the zigzag stream,
// where does it appear in the 8x8 matrix coded as row-major?
static uint8_t stbi__jpeg_dezigzag[64 + 15] =
{
    0,  1,  8, 16,  9,  2,  3, 10,
   17, 24, 32, 25, 18, 11,  4,  5,
   12, 19, 26, 33, 40, 48, 41, 34,
   27, 20, 13,  6,  7, 14, 21, 28,
   35, 42, 49, 56, 57, 50, 43, 36,
   29, 22, 15, 23, 30, 37, 44, 51,
   58, 59, 52, 45, 38, 31, 39, 46,
   53, 60, 61, 54, 47, 55, 62, 63,
   // let corrupt input sample past end
   63, 63, 63, 63, 63, 63, 63, 63,
   63, 63, 63, 63, 63, 63, 63
};
// decode one 64-entry block--
static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, int16_t *fac, int b, uint8_t *dequant)
{
   int diff, dc, k;
   int t;

   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
   t = stbi__jpeg_huff_decode(j, hdc);
   if (t < 0) return stbi__err("bad huffman code", "Corrupt JPEG");

   // 0 all the ac values now so we can do it 32-bits at a time
   memset(data, 0, 64 * sizeof(data[0]));

   diff = t ? stbi__extend_receive(j, t) : 0;
   dc = j->img_comp[b].dc_pred + diff;
   j->img_comp[b].dc_pred = dc;
   data[0] = (short)(dc * dequant[0]);

   // decode AC components, see JPEG spec
   k = 1;
   do {
      unsigned int zig;
      int c, r, s;
      if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
      c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
      r = fac[c];
      if (r) { // fast-AC path
         k += (r >> 4) & 15; // run
         s = r & 15;         // combined length
         j->code_buffer <<= s;
         j->code_bits   -= s;
         // decode into unzigzag'd location
         zig = stbi__jpeg_dezigzag[k++];
         data[zig] = (short)((r >> 8) * dequant[zig]);
      } else {
         int rs = stbi__jpeg_huff_decode(j, hac);
         if (rs < 0) return stbi__err("bad huffman code", "Corrupt JPEG");
         s = rs & 15;
         r = rs >> 4;
         if (s == 0) {
            if (rs != 0xf0) break; // end block
            k += 16;
         } else {
            k += r;
            // decode into unzigzag'd location
            zig = stbi__jpeg_dezigzag[k++];
            data[zig] = (short)(stbi__extend_receive(j, s) * dequant[zig]);
         }
      }
   } while (k < 64);
   return 1;
}
static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__huffman *hdc, int b)
{
   if (j->spec_end != 0)
      return stbi__err("can't merge dc and ac", "Corrupt JPEG");

   if (j->code_bits < 16)
      stbi__grow_buffer_unsafe(j);

   if (j->succ_high == 0)
   {
      int diff, dc;
      int t;

      /* first scan for DC coefficient, must be first */
      memset(data, 0, 64 * sizeof(data[0])); // 0 all the ac values now
      t = stbi__jpeg_huff_decode(j, hdc);
      diff = t ? stbi__extend_receive(j, t) : 0;

      dc = j->img_comp[b].dc_pred + diff;
      j->img_comp[b].dc_pred = dc;
      data[0] = (short)(dc << j->succ_low);
   }
   else
   {
      /* refinement scan for DC coefficient */
      if (stbi__jpeg_get_bit(j))
         data[0] += (short)(1 << j->succ_low);
   }
   return 1;
}
// @OPTIMIZE: store non-zigzagged during the decode passes,
// and only de-zigzag when dequantizing
static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__huffman *hac, int16_t *fac)
{
   int k;
   if (j->spec_start == 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");

   if (j->succ_high == 0) {
      int shift = j->succ_low;

      if (j->eob_run) {
         --j->eob_run;
         return 1;
      }

      k = j->spec_start;
      do {
         unsigned int zig;
         int c, r, s;
         if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
         c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS) - 1);
         r = fac[c];
         if (r) { // fast-AC path
            k += (r >> 4) & 15; // run
            s = r & 15;         // combined length
            j->code_buffer <<= s;
            j->code_bits   -= s;
            zig = stbi__jpeg_dezigzag[k++];
            data[zig] = (short)((r >> 8) << shift);
         } else {
            int rs = stbi__jpeg_huff_decode(j, hac);
            if (rs < 0) return stbi__err("bad huffman code", "Corrupt JPEG");
            s = rs & 15;
            r = rs >> 4;
            if (s == 0) {
               if (r < 15) {
                  j->eob_run = (1 << r);
                  if (r)
                     j->eob_run += stbi__jpeg_get_bits(j, r);
                  --j->eob_run;
                  break;
               }
               k += 16;
            } else {
               k += r;
               zig = stbi__jpeg_dezigzag[k++];
               data[zig] = (short)(stbi__extend_receive(j, s) << shift);
            }
         }
      } while (k <= j->spec_end);
   } else {
      // refinement scan for these AC coefficients
      short bit = (short)(1 << j->succ_low);

      if (j->eob_run) {
         --j->eob_run;
         for (k = j->spec_start; k <= j->spec_end; ++k) {
            short *p = &data[stbi__jpeg_dezigzag[k]];
            if (*p != 0)
               if (stbi__jpeg_get_bit(j))
                  if ((*p & bit) == 0) {
                     if (*p > 0)
                        *p += bit;
                     else
                        *p -= bit;
                  }
         }
      } else {
         k = j->spec_start;
         do {
            int r, s;
            int rs = stbi__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh
            if (rs < 0) return stbi__err("bad huffman code", "Corrupt JPEG");
            s = rs & 15;
            r = rs >> 4;
            if (s == 0) {
               if (r < 15) {
                  j->eob_run = (1 << r) - 1;
                  if (r)
                     j->eob_run += stbi__jpeg_get_bits(j, r);
                  r = 64; // force end of block
               } else {
                  // r=15 s=0 should write 16 0s, so we just do
                  // a run of 15 0s and then write s (which is 0),
                  // so we don't have to do anything special here
               }
            } else {
               if (s != 1) return stbi__err("bad huffman code", "Corrupt JPEG");
               // sign bit
               if (stbi__jpeg_get_bit(j))
                  s = bit;
               else
                  s = -bit;
            }

            // advance by r
            while (k <= j->spec_end) {
               short *p = &data[stbi__jpeg_dezigzag[k++]];
               if (*p != 0) {
                  if (stbi__jpeg_get_bit(j))
                     if ((*p & bit) == 0) {
                        if (*p > 0)
                           *p += bit;
                        else
                           *p -= bit;
                     }
               } else {
                  if (r == 0) {
                     *p = (short)s;
                     break;
                  }
                  --r;
               }
            }
         } while (k <= j->spec_end);
      }
   }
   return 1;
}
// take a -128..127 value and stbi__clamp it and convert to 0..255
static INLINE uint8_t stbi__clamp(int x)
{
   // trick to use a single test to catch both cases
   if ((unsigned int)x > 255) {
      if (x < 0) return 0;
      if (x > 255) return 255;
   }
   return (uint8_t)x;
}

#define stbi__f2f(x)  ((int)(((x) * 4096 + 0.5)))
#define stbi__fsh(x)  ((x) << 12)
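/* Both macros work on the same 12-bit fixed-point scale: stbi__f2f rounds a
 * floating-point IDCT constant to the nearest multiple of 1/4096, and
 * stbi__fsh lifts an integer term by the matching factor of 4096 so every
 * product and sum inside STBI__IDCT_1D shares one scale until the final
 * descaling shifts. */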
// derived from jidctint -- DCT_ISLOW
#define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
   int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \
   p2 = s2;                                    \
   p3 = s6;                                    \
   p1 = (p2 + p3) * stbi__f2f(0.5411961f);     \
   t2 = p1 + p3 * stbi__f2f(-1.847759065f);    \
   t3 = p1 + p2 * stbi__f2f( 0.765366865f);    \
   p2 = s0;                                    \
   p3 = s4;                                    \
   t0 = stbi__fsh(p2 + p3);                    \
   t1 = stbi__fsh(p2 - p3);                    \
   x0 = t0 + t3;                               \
   x3 = t0 - t3;                               \
   x1 = t1 + t2;                               \
   x2 = t1 - t2;                               \
   t0 = s7;                                    \
   t1 = s5;                                    \
   t2 = s3;                                    \
   t3 = s1;                                    \
   p3 = t0 + t2;                               \
   p4 = t1 + t3;                               \
   p1 = t0 + t3;                               \
   p2 = t1 + t2;                               \
   p5 = (p3 + p4) * stbi__f2f( 1.175875602f);  \
   t0 = t0 * stbi__f2f( 0.298631336f);         \
   t1 = t1 * stbi__f2f( 2.053119869f);         \
   t2 = t2 * stbi__f2f( 3.072711026f);         \
   t3 = t3 * stbi__f2f( 1.501321110f);         \
   p1 = p5 + p1 * stbi__f2f(-0.899976223f);    \
   p2 = p5 + p2 * stbi__f2f(-2.562915447f);    \
   p3 = p3 * stbi__f2f(-1.961570560f);         \
   p4 = p4 * stbi__f2f(-0.390180644f);         \
   t3 += p1 + p4;                              \
   t2 += p2 + p3;                              \
   t1 += p2 + p4;                              \
   t0 += p1 + p3;
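/* STBI__IDCT_1D computes one 8-point 1-D IDCT: s0..s7 are the eight inputs
 * of a column (or row), the even half yields x0..x3 and the odd half yields
 * t0..t3, and the caller produces the eight outputs as the butterflies
 * (x0+t3, x1+t2, x2+t1, x3+t0) and the mirrored differences (x3-t0 .. x0-t3),
 * still carrying the 4096x fixed-point scale until the shifts below. */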
static void stbi__idct_block(uint8_t *out, int out_stride, short data[64])
{
   int i, val[64], *v = val;
   uint8_t *o;
   short *d = data;

   // columns
   for (i = 0; i < 8; ++i, ++d, ++v) {
      // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
      if (d[8] == 0 && d[16] == 0 && d[24] == 0 && d[32] == 0
           && d[40] == 0 && d[48] == 0 && d[56] == 0) {
         //    no shortcut                 0     seconds
         //    (1|2|3|4|5|6|7)==0          0     seconds
         //    all separate               -0.047 seconds
         //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
         int dcterm = d[0] << 2;
         v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
      } else {
         STBI__IDCT_1D(d[0],d[8],d[16],d[24],d[32],d[40],d[48],d[56])
         // constants scaled things up by 1<<12; let's bring them back
         // down, but keep 2 extra bits of precision
         x0 += 512; x1 += 512; x2 += 512; x3 += 512;
         v[ 0] = (x0 + t3) >> 10;
         v[56] = (x0 - t3) >> 10;
         v[ 8] = (x1 + t2) >> 10;
         v[48] = (x1 - t2) >> 10;
         v[16] = (x2 + t1) >> 10;
         v[40] = (x2 - t1) >> 10;
         v[24] = (x3 + t0) >> 10;
         v[32] = (x3 - t0) >> 10;
      }
   }

   for (i = 0, v = val, o = out; i < 8; ++i, v += 8, o += out_stride) {
      // no fast case since the first 1D IDCT spread components out
      STBI__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7])
      // constants scaled things up by 1<<12, plus we had 1<<2 from first
      // loop, plus horizontal and vertical each scale by sqrt(8) so together
      // we've got an extra 1<<3, so 1<<17 total we need to remove.
      // so we want to round that, which means adding 0.5 * 1<<17,
      // aka 65536. Also, we'll end up with -128 to 127 that we want
      // to encode as 0..255 by adding 128, so we'll add that before the shift
      x0 += 65536 + (128 << 17);
      x1 += 65536 + (128 << 17);
      x2 += 65536 + (128 << 17);
      x3 += 65536 + (128 << 17);
      // tried computing the shifts into temps, or'ing the temps to see
      // if any were out of range, but that was slower
      o[0] = stbi__clamp((x0 + t3) >> 17);
      o[7] = stbi__clamp((x0 - t3) >> 17);
      o[1] = stbi__clamp((x1 + t2) >> 17);
      o[6] = stbi__clamp((x1 - t2) >> 17);
      o[2] = stbi__clamp((x2 + t1) >> 17);
      o[5] = stbi__clamp((x2 - t1) >> 17);
      o[3] = stbi__clamp((x3 + t0) >> 17);
      o[4] = stbi__clamp((x3 - t0) >> 17);
   }
}
#ifdef STBI_SSE2
/* sse2 integer IDCT. not the fastest possible implementation but it
 * produces bit-identical results to the generic C version so it's
 * fully "transparent".
 */
static void stbi__idct_simd(uint8_t *out, int out_stride, short data[64])
{
   /* This is constructed to match our regular (generic) integer IDCT exactly. */
   __m128i row0, row1, row2, row3, row4, row5, row6, row7;
   __m128i tmp;

   /* dot product constant: even elems=x, odd elems=y */
   #define dct_const(x,y)  _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))

   /* out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
    * out(1) = c1[even]*x + c1[odd]*y
    */
   #define dct_rot(out0,out1, x,y,c0,c1) \
      __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
      __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
      __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
      __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
      __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
      __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)

   /* out = in << 12  (in 16-bit, out 32-bit) */
   #define dct_widen(out, in) \
      __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
      __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)

   /* wide add */
   #define dct_wadd(out, a, b) \
      __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
      __m128i out##_h = _mm_add_epi32(a##_h, b##_h)

   /* wide sub */
   #define dct_wsub(out, a, b) \
      __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
      __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)

   /* butterfly a/b, add bias, then shift by "s" and pack */
   #define dct_bfly32o(out0, out1, a,b,bias,s) \
      { \
         __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
         __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
         dct_wadd(sum, abiased, b); \
         dct_wsub(dif, abiased, b); \
         out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
         out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
      }

   /* 8-bit interleave step (for transposes) */
   #define dct_interleave8(a, b) \
      tmp = a; \
      a = _mm_unpacklo_epi8(a, b); \
      b = _mm_unpackhi_epi8(tmp, b)

   /* 16-bit interleave step (for transposes) */
   #define dct_interleave16(a, b) \
      tmp = a; \
      a = _mm_unpacklo_epi16(a, b); \
      b = _mm_unpackhi_epi16(tmp, b)

   #define dct_pass(bias,shift) \
      { \
         /* even part */ \
         dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
         __m128i sum04 = _mm_add_epi16(row0, row4); \
         __m128i dif04 = _mm_sub_epi16(row0, row4); \
         dct_widen(t0e, sum04); \
         dct_widen(t1e, dif04); \
         dct_wadd(x0, t0e, t3e); \
         dct_wsub(x3, t0e, t3e); \
         dct_wadd(x1, t1e, t2e); \
         dct_wsub(x2, t1e, t2e); \
         /* odd part */ \
         dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
         dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
         __m128i sum17 = _mm_add_epi16(row1, row7); \
         __m128i sum35 = _mm_add_epi16(row3, row5); \
         dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
         dct_wadd(x4, y0o, y4o); \
         dct_wadd(x5, y1o, y5o); \
         dct_wadd(x6, y2o, y5o); \
         dct_wadd(x7, y3o, y4o); \
         dct_bfly32o(row0,row7, x0,x7,bias,shift); \
         dct_bfly32o(row1,row6, x1,x6,bias,shift); \
         dct_bfly32o(row2,row5, x2,x5,bias,shift); \
         dct_bfly32o(row3,row4, x3,x4,bias,shift); \
      }

   __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f));
   __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f));
   __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f));
   __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f));
   __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), stbi__f2f(-1.961570560f));
   __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f));
   __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f));
   __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f));

   /* rounding biases in column/row passes, see stbi__idct_block for explanation. */
   __m128i bias_0 = _mm_set1_epi32(512);
   __m128i bias_1 = _mm_set1_epi32(65536 + (128 << 17));

   /* load */
   row0 = _mm_load_si128((const __m128i *)(data + 0*8));
   row1 = _mm_load_si128((const __m128i *)(data + 1*8));
   row2 = _mm_load_si128((const __m128i *)(data + 2*8));
   row3 = _mm_load_si128((const __m128i *)(data + 3*8));
   row4 = _mm_load_si128((const __m128i *)(data + 4*8));
   row5 = _mm_load_si128((const __m128i *)(data + 5*8));
   row6 = _mm_load_si128((const __m128i *)(data + 6*8));
   row7 = _mm_load_si128((const __m128i *)(data + 7*8));

   /* column pass */
   dct_pass(bias_0, 10);

   {
      /* 16bit 8x8 transpose pass 1 */
      dct_interleave16(row0, row4);
      dct_interleave16(row1, row5);
      dct_interleave16(row2, row6);
      dct_interleave16(row3, row7);

      /* transpose pass 2 */
      dct_interleave16(row0, row2);
      dct_interleave16(row1, row3);
      dct_interleave16(row4, row6);
      dct_interleave16(row5, row7);

      /* transpose pass 3 */
      dct_interleave16(row0, row1);
      dct_interleave16(row2, row3);
      dct_interleave16(row4, row5);
      dct_interleave16(row6, row7);
   }

   /* row pass */
   dct_pass(bias_1, 17);

   {
      /* pack */
      __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
      __m128i p1 = _mm_packus_epi16(row2, row3);
      __m128i p2 = _mm_packus_epi16(row4, row5);
      __m128i p3 = _mm_packus_epi16(row6, row7);

      // 8bit 8x8 transpose pass 1
      dct_interleave8(p0, p2); // a0e0a1e1...
      dct_interleave8(p1, p3); // c0g0c1g1...

      // transpose pass 2
      dct_interleave8(p0, p1); // a0c0e0g0...
      dct_interleave8(p2, p3); // b0d0f0h0...

      // transpose pass 3
      dct_interleave8(p0, p2); // a0b0c0d0...
      dct_interleave8(p1, p3); // a4b4c4d4...

      // store
      _mm_storel_epi64((__m128i *)out, p0); out += out_stride;
      _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
      _mm_storel_epi64((__m128i *)out, p2); out += out_stride;
      _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
      _mm_storel_epi64((__m128i *)out, p1); out += out_stride;
      _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
      _mm_storel_epi64((__m128i *)out, p3); out += out_stride;
      _mm_storel_epi64((__m128i *)out, _mm_shuffle_epi32(p3, 0x4e));
   }

#undef dct_const
#undef dct_rot
#undef dct_widen
#undef dct_wadd
#undef dct_wsub
#undef dct_bfly32o
#undef dct_interleave8
#undef dct_interleave16
#undef dct_pass
}
#endif /* STBI_SSE2 */
#ifdef STBI_NEON
/* NEON integer IDCT. should produce bit-identical
 * results to the generic C version. */
static void stbi__idct_simd(uint8_t *out, int out_stride, short data[64])
{
   int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;

   int16x4_t rot0_0 = vdup_n_s16(stbi__f2f( 0.5411961f));
   int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f));
   int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f));
   int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f));
   int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f));
   int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f));
   int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f));
   int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f));
   int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f));
   int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f));
   int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f));
   int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f));

   #define dct_long_mul(out, inq, coeff) \
      int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
      int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)

   #define dct_long_mac(out, acc, inq, coeff) \
      int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
      int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)

   #define dct_widen(out, inq) \
      int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
      int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)

   /* wide add */
   #define dct_wadd(out, a, b) \
      int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
      int32x4_t out##_h = vaddq_s32(a##_h, b##_h)

   /* wide sub */
   #define dct_wsub(out, a, b) \
      int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
      int32x4_t out##_h = vsubq_s32(a##_h, b##_h)

   // butterfly a/b, then shift using "shiftop" by "s" and pack
   #define dct_bfly32o(out0,out1, a,b,shiftop,s) \
      { \
         dct_wadd(sum, a, b); \
         dct_wsub(dif, a, b); \
         out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
         out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
      }

   #define dct_pass(shiftop, shift) \
      { \
         /* even part */ \
         int16x8_t sum26 = vaddq_s16(row2, row6); \
         dct_long_mul(p1e, sum26, rot0_0); \
         dct_long_mac(t2e, p1e, row6, rot0_1); \
         dct_long_mac(t3e, p1e, row2, rot0_2); \
         int16x8_t sum04 = vaddq_s16(row0, row4); \
         int16x8_t dif04 = vsubq_s16(row0, row4); \
         dct_widen(t0e, sum04); \
         dct_widen(t1e, dif04); \
         dct_wadd(x0, t0e, t3e); \
         dct_wsub(x3, t0e, t3e); \
         dct_wadd(x1, t1e, t2e); \
         dct_wsub(x2, t1e, t2e); \
         /* odd part */ \
         int16x8_t sum15 = vaddq_s16(row1, row5); \
         int16x8_t sum17 = vaddq_s16(row1, row7); \
         int16x8_t sum35 = vaddq_s16(row3, row5); \
         int16x8_t sum37 = vaddq_s16(row3, row7); \
         int16x8_t sumodd = vaddq_s16(sum17, sum35); \
         dct_long_mul(p5o, sumodd, rot1_0); \
         dct_long_mac(p1o, p5o, sum17, rot1_1); \
         dct_long_mac(p2o, p5o, sum35, rot1_2); \
         dct_long_mul(p3o, sum37, rot2_0); \
         dct_long_mul(p4o, sum15, rot2_1); \
         dct_wadd(sump13o, p1o, p3o); \
         dct_wadd(sump24o, p2o, p4o); \
         dct_wadd(sump23o, p2o, p3o); \
         dct_wadd(sump14o, p1o, p4o); \
         dct_long_mac(x4, sump13o, row7, rot3_0); \
         dct_long_mac(x5, sump24o, row5, rot3_1); \
         dct_long_mac(x6, sump23o, row3, rot3_2); \
         dct_long_mac(x7, sump14o, row1, rot3_3); \
         dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
         dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
         dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
         dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
      }

   // load
   row0 = vld1q_s16(data + 0*8);
   row1 = vld1q_s16(data + 1*8);
   row2 = vld1q_s16(data + 2*8);
   row3 = vld1q_s16(data + 3*8);
   row4 = vld1q_s16(data + 4*8);
   row5 = vld1q_s16(data + 5*8);
   row6 = vld1q_s16(data + 6*8);
   row7 = vld1q_s16(data + 7*8);

   // add DC bias
   row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));

   // column pass
   dct_pass(vrshrn_n_s32, 10);

   // 16bit 8x8 transpose
   {
      // these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
      // whether compilers actually get this is another story, sadly.
      #define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
      #define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
      #define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }

      // pass 1
      dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
      dct_trn16(row2, row3);
      dct_trn16(row4, row5);
      dct_trn16(row6, row7);

      // pass 2
      dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
      dct_trn32(row1, row3);
      dct_trn32(row4, row6);
      dct_trn32(row5, row7);

      // pass 3
      dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
      dct_trn64(row1, row5);
      dct_trn64(row2, row6);
      dct_trn64(row3, row7);

      #undef dct_trn16
      #undef dct_trn32
      #undef dct_trn64
   }

   // row pass
   // vrshrn_n_s32 only supports shifts up to 16, we need
   // 17. so do a non-rounding shift of 16 first then follow
   // up with a rounding shift by 1.
   dct_pass(vshrn_n_s32, 16);

   {
      /* pack and round */
      uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
      uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
      uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
      uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
      uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
      uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
      uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
      uint8x8_t p7 = vqrshrun_n_s16(row7, 1);

      /* again, these can translate into one instruction, but often don't. */
      #define dct_trn8_8(x, y)  { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
      #define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
      #define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }

      /* sadly can't use interleaved stores here since we only write
       * 8 bytes to each scan line! */

      /* 8x8 8-bit transpose pass 1 */
      dct_trn8_8(p0, p1);
      dct_trn8_8(p2, p3);
      dct_trn8_8(p4, p5);
      dct_trn8_8(p6, p7);

      /* pass 2 */
      dct_trn8_16(p0, p2);
      dct_trn8_16(p1, p3);
      dct_trn8_16(p4, p6);
      dct_trn8_16(p5, p7);

      /* pass 3 */
      dct_trn8_32(p0, p4);
      dct_trn8_32(p1, p5);
      dct_trn8_32(p2, p6);
      dct_trn8_32(p3, p7);

      /* store */
      vst1_u8(out, p0); out += out_stride;
      vst1_u8(out, p1); out += out_stride;
      vst1_u8(out, p2); out += out_stride;
      vst1_u8(out, p3); out += out_stride;
      vst1_u8(out, p4); out += out_stride;
      vst1_u8(out, p5); out += out_stride;
      vst1_u8(out, p6); out += out_stride;
      vst1_u8(out, p7);

      #undef dct_trn8_8
      #undef dct_trn8_16
      #undef dct_trn8_32
   }

#undef dct_long_mul
#undef dct_long_mac
#undef dct_widen
#undef dct_wadd
#undef dct_wsub
#undef dct_bfly32o
#undef dct_pass
}
#endif /* STBI_NEON */
#define STBI__MARKER_none  0xff

/* if there's a pending marker from the entropy stream, return that
 * otherwise, fetch from the stream and get a marker. if there's no
 * marker, return 0xff, which is never a valid marker value
 */
static uint8_t stbi__get_marker(stbi__jpeg *j)
{
   uint8_t x;
   if (j->marker != STBI__MARKER_none) { x = j->marker; j->marker = STBI__MARKER_none; return x; }
   x = stbi__get8(j->s);
   if (x != 0xff) return STBI__MARKER_none;
   while (x == 0xff)
      x = stbi__get8(j->s);
   return x;
}

/* in each scan, we'll have scan_n components, and the order
 * of the components is specified by order[]
 */
#define STBI__RESTART(x)  ((x) >= 0xd0 && (x) <= 0xd7)
/* after a restart interval, stbi__jpeg_reset the entropy decoder and
 * the dc prediction
 */
static void stbi__jpeg_reset(stbi__jpeg *j)
{
   j->code_bits = 0;
   j->code_buffer = 0;
   j->nomore = 0;
   j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = 0;
   j->marker = STBI__MARKER_none;
   j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
   j->eob_run = 0;
   // no more than 1<<31 MCUs if no restart_interval? that's plenty safe,
   // since we don't even allow 1<<30 pixels
}
static int stbi__parse_entropy_coded_data(stbi__jpeg *z)
{
   stbi__jpeg_reset(z);
   if (!z->progressive) {
      if (z->scan_n == 1) {
         int i, j;
         STBI_SIMD_ALIGN(short, data[64]);
         int n = z->order[0];
         // non-interleaved data, we just need to process one block at a time,
         // in trivial scanline order
         // number of blocks to do just depends on how many actual "pixels" this
         // component has, independent of interleaved MCU blocking and such
         int w = (z->img_comp[n].x + 7) >> 3;
         int h = (z->img_comp[n].y + 7) >> 3;
         for (j = 0; j < h; ++j) {
            for (i = 0; i < w; ++i) {
               int ha = z->img_comp[n].ha;
               if (!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
               z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8, z->img_comp[n].w2, data);
               // every data block is an MCU, so countdown the restart interval
               if (--z->todo <= 0) {
                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
                  // if it's NOT a restart, then just bail, so we get corrupt data
                  // rather than no data
                  if (!STBI__RESTART(z->marker)) return 1;
                  stbi__jpeg_reset(z);
               }
            }
         }
         return 1;
      } else { // interleaved
         int i, j, k, x, y;
         STBI_SIMD_ALIGN(short, data[64]);
         for (j = 0; j < z->img_mcu_y; ++j) {
            for (i = 0; i < z->img_mcu_x; ++i) {
               // scan an interleaved mcu... process scan_n components in order
               for (k = 0; k < z->scan_n; ++k) {
                  int n = z->order[k];
                  // scan out an mcu's worth of this component; that's just determined
                  // by the basic H and V specified for the component
                  for (y = 0; y < z->img_comp[n].v; ++y) {
                     for (x = 0; x < z->img_comp[n].h; ++x) {
                        int x2 = (i * z->img_comp[n].h + x) * 8;
                        int y2 = (j * z->img_comp[n].v + y) * 8;
                        int ha = z->img_comp[n].ha;
                        if (!stbi__jpeg_decode_block(z, data, z->huff_dc + z->img_comp[n].hd, z->huff_ac + ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
                        z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * y2 + x2, z->img_comp[n].w2, data);
                     }
                  }
               }
               // after all interleaved components, that's an interleaved MCU,
               // so now count down the restart interval
               if (--z->todo <= 0) {
                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
                  if (!STBI__RESTART(z->marker)) return 1;
                  stbi__jpeg_reset(z);
               }
            }
         }
         return 1;
      }
   } else {
      if (z->scan_n == 1) {
         int i, j;
         int n = z->order[0];
         // non-interleaved data, we just need to process one block at a time,
         // in trivial scanline order
         // number of blocks to do just depends on how many actual "pixels" this
         // component has, independent of interleaved MCU blocking and such
         int w = (z->img_comp[n].x + 7) >> 3;
         int h = (z->img_comp[n].y + 7) >> 3;
         for (j = 0; j < h; ++j) {
            for (i = 0; i < w; ++i) {
               short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
               if (z->spec_start == 0) {
                  if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
                     return 0;
               } else {
                  int ha = z->img_comp[n].ha;
                  if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
                     return 0;
               }
               // every data block is an MCU, so countdown the restart interval
               if (--z->todo <= 0) {
                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
                  if (!STBI__RESTART(z->marker)) return 1;
                  stbi__jpeg_reset(z);
               }
            }
         }
         return 1;
      } else { // interleaved
         int i, j, k, x, y;
         for (j = 0; j < z->img_mcu_y; ++j) {
            for (i = 0; i < z->img_mcu_x; ++i) {
               // scan an interleaved mcu... process scan_n components in order
               for (k = 0; k < z->scan_n; ++k) {
                  int n = z->order[k];
                  // scan out an mcu's worth of this component; that's just determined
                  // by the basic H and V specified for the component
                  for (y = 0; y < z->img_comp[n].v; ++y) {
                     for (x = 0; x < z->img_comp[n].h; ++x) {
                        int x2 = (i * z->img_comp[n].h + x);
                        int y2 = (j * z->img_comp[n].v + y);
                        short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
                        if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
                           return 0;
                     }
                  }
               }
               // after all interleaved components, that's an interleaved MCU,
               // so now count down the restart interval
               if (--z->todo <= 0) {
                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
                  if (!STBI__RESTART(z->marker)) return 1;
                  stbi__jpeg_reset(z);
               }
            }
         }
         return 1;
      }
   }
}
static void stbi__jpeg_dequantize(short *data, uint8_t *dequant)
{
   int i;
   for (i = 0; i < 64; ++i)
      data[i] *= dequant[i];
}

static void stbi__jpeg_finish(stbi__jpeg *z)
{
   if (z->progressive) {
      // dequantize and idct the data
      int i, j, n;
      for (n = 0; n < z->s->img_n; ++n) {
         int w = (z->img_comp[n].x + 7) >> 3;
         int h = (z->img_comp[n].y + 7) >> 3;
         for (j = 0; j < h; ++j) {
            for (i = 0; i < w; ++i) {
               short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
               stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]);
               z->idct_block_kernel(z->img_comp[n].data + z->img_comp[n].w2 * j * 8 + i * 8, z->img_comp[n].w2, data);
            }
         }
      }
   }
}
static int stbi__process_marker(stbi__jpeg *z, int m)
{
   int L;
   switch (m) {
      case STBI__MARKER_none: // no marker found
         return stbi__err("expected marker", "Corrupt JPEG");

      case 0xDD: // DRI - specify restart interval
         if (stbi__get16be(z->s) != 4) return stbi__err("bad DRI len", "Corrupt JPEG");
         z->restart_interval = stbi__get16be(z->s);
         return 1;

      case 0xDB: // DQT - define quantization table
         L = stbi__get16be(z->s) - 2;
         while (L > 0) {
            int q = stbi__get8(z->s);
            int p = q >> 4;
            int t = q & 15, i;
            if (p != 0) return stbi__err("bad DQT type", "Corrupt JPEG");
            if (t > 3) return stbi__err("bad DQT table", "Corrupt JPEG");
            for (i = 0; i < 64; ++i)
               z->dequant[t][stbi__jpeg_dezigzag[i]] = stbi__get8(z->s);
            L -= 65;
         }
         return L == 0;

      case 0xC4: // DHT - define huffman table
         L = stbi__get16be(z->s) - 2;
         while (L > 0) {
            uint8_t *v;
            int sizes[16], i, n = 0;
            int q = stbi__get8(z->s);
            int tc = q >> 4;
            int th = q & 15;
            if (tc > 1 || th > 3) return stbi__err("bad DHT header", "Corrupt JPEG");
            for (i = 0; i < 16; ++i) {
               sizes[i] = stbi__get8(z->s);
               n += sizes[i];
            }
            L -= 17;
            if (tc == 0) {
               if (!stbi__build_huffman(z->huff_dc + th, sizes)) return 0;
               v = z->huff_dc[th].values;
            } else {
               if (!stbi__build_huffman(z->huff_ac + th, sizes)) return 0;
               v = z->huff_ac[th].values;
            }
            for (i = 0; i < n; ++i)
               v[i] = stbi__get8(z->s);
            if (tc != 0)
               stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
            L -= n;
         }
         return L == 0;
   }
   // check for comment block or APP blocks
   if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
      stbi__skip(z->s, stbi__get16be(z->s) - 2);
      return 1;
   }
   return 0;
}
// after we see SOS
static int stbi__process_scan_header(stbi__jpeg *z)
{
   int i;
   int Ls = stbi__get16be(z->s);

   z->scan_n = stbi__get8(z->s);
   if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int)z->s->img_n)
      return stbi__err("bad SOS component count", "Corrupt JPEG");
   if (Ls != 6 + 2 * z->scan_n)
      return stbi__err("bad SOS len", "Corrupt JPEG");

   for (i = 0; i < z->scan_n; ++i)
   {
      int id = stbi__get8(z->s), which;
      int q = stbi__get8(z->s);
      for (which = 0; which < z->s->img_n; ++which)
         if (z->img_comp[which].id == id)
            break;
      if (which == z->s->img_n)
         return 0; /* no match */
      z->img_comp[which].hd = q >> 4;   if (z->img_comp[which].hd > 3)
         return stbi__err("bad DC huff", "Corrupt JPEG");
      z->img_comp[which].ha = q & 15;   if (z->img_comp[which].ha > 3)
         return stbi__err("bad AC huff", "Corrupt JPEG");
      z->order[i] = which;
   }

   {
      int aa;
      z->spec_start = stbi__get8(z->s);
      z->spec_end   = stbi__get8(z->s); /* should be 63, but might be 0 */
      aa = stbi__get8(z->s);
      z->succ_high = (aa >> 4);
      z->succ_low  = (aa & 15);
      if (z->progressive) {
         if (z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13)
            return stbi__err("bad SOS", "Corrupt JPEG");
      } else {
         if (z->spec_start != 0) return stbi__err("bad SOS", "Corrupt JPEG");
         if (z->succ_high != 0 || z->succ_low != 0) return stbi__err("bad SOS", "Corrupt JPEG");
         z->spec_end = 63;
      }
   }

   return 1;
}
2016-05-10 05:50:24 +02:00
static int stbi__process_frame_header(stbi__jpeg *z, int scan)
{
   stbi__context *s = z->s;
   int Lf, p, i, q, h_max = 1, v_max = 1, c;
   Lf = stbi__get16be(s);         if (Lf < 11) return stbi__err("bad SOF len", "Corrupt JPEG"); // JPEG
   p  = stbi__get8(s);            if (p != 8) return stbi__err("only 8-bit", "JPEG format not supported: 8-bit only"); // JPEG baseline
   s->img_y = stbi__get16be(s);   if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
   s->img_x = stbi__get16be(s);   if (s->img_x == 0) return stbi__err("0 width", "Corrupt JPEG"); // JPEG requires
   c = stbi__get8(s);
   if (c != 3 && c != 1) return stbi__err("bad component count", "Corrupt JPEG"); // JFIF requires
   s->img_n = c;
   for (i = 0; i < c; ++i) {
      z->img_comp[i].data = NULL;
      z->img_comp[i].linebuf = NULL;
   }

   if (Lf != 8 + 3 * s->img_n) return stbi__err("bad SOF len", "Corrupt JPEG");

   for (i = 0; i < s->img_n; ++i) {
      z->img_comp[i].id = stbi__get8(s);
      if (z->img_comp[i].id != i + 1)   // JFIF requires
         if (z->img_comp[i].id != i)    // some version of jpegtran outputs non-JFIF-compliant files!
            return stbi__err("bad component ID", "Corrupt JPEG");
      q = stbi__get8(s);
      z->img_comp[i].h = (q >> 4);   if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H", "Corrupt JPEG");
      z->img_comp[i].v =  q & 15;    if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V", "Corrupt JPEG");
      z->img_comp[i].tq = stbi__get8(s);   if (z->img_comp[i].tq > 3) return stbi__err("bad TQ", "Corrupt JPEG");
   }

   if (scan != STBI__SCAN_load) return 1;

   if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");

   for (i = 0; i < s->img_n; ++i) {
      if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h;
      if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v;
   }

   // compute interleaved mcu info
   z->img_h_max = h_max;
   z->img_v_max = v_max;
   z->img_mcu_w = h_max * 8;
   z->img_mcu_h = v_max * 8;
   z->img_mcu_x = (s->img_x + z->img_mcu_w - 1) / z->img_mcu_w;
   z->img_mcu_y = (s->img_y + z->img_mcu_h - 1) / z->img_mcu_h;
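
   /* Worked example: a typical 4:2:0 frame has h=2,v=2 for Y and h=1,v=1 for
    * Cb/Cr, so an interleaved MCU covers 16x16 luma pixels.  A 33x17 image
    * then needs img_mcu_x = 3 and img_mcu_y = 2, and each component buffer
    * below (w2, h2) is rounded up to whole MCUs; the excess is trimmed during
    * colorspace conversion. */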
   for (i = 0; i < s->img_n; ++i) {
      // number of effective pixels (e.g. for non-interleaved MCU)
      z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max - 1) / h_max;
      z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max - 1) / v_max;
      // to simplify generation, we'll allocate enough memory to decode
      // the bogus oversized data from using interleaved MCUs and their
      // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
      // discard the extra data until colorspace conversion
      z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
      z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
      z->img_comp[i].raw_data = malloc(z->img_comp[i].w2 * z->img_comp[i].h2 + 15);
      if (z->img_comp[i].raw_data == NULL) {
         for (--i; i >= 0; --i) {
            free(z->img_comp[i].raw_data);
            z->img_comp[i].data = NULL;
         }
         return stbi__err("outofmem", "Out of memory");
      }
      // align blocks for idct using mmx/sse
      z->img_comp[i].data = (uint8_t *) (((size_t)z->img_comp[i].raw_data + 15) & ~15);
      z->img_comp[i].linebuf = NULL;
      if (z->progressive) {
         z->img_comp[i].coeff_w = (z->img_comp[i].w2 + 7) >> 3;
         z->img_comp[i].coeff_h = (z->img_comp[i].h2 + 7) >> 3;
         z->img_comp[i].raw_coeff = malloc(z->img_comp[i].coeff_w * z->img_comp[i].coeff_h * 64 * sizeof(short) + 15);
         z->img_comp[i].coeff = (short *) (((size_t)z->img_comp[i].raw_coeff + 15) & ~15);
      } else {
         z->img_comp[i].coeff = 0;
         z->img_comp[i].raw_coeff = 0;
      }
   }

   return 1;
}
// use comparisons since in some cases we handle more than one case (e.g. SOF)
#define stbi__DNL(x)   ((x) == 0xdc)
#define stbi__SOI(x)   ((x) == 0xd8)
#define stbi__EOI(x)   ((x) == 0xd9)
#define stbi__SOF(x)   ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2)
#define stbi__SOS(x)   ((x) == 0xda)

#define stbi__SOF_progressive(x)   ((x) == 0xc2)
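
/* Only 0xc0 (baseline), 0xc1 (extended sequential) and 0xc2 (progressive
 * huffman) frames are accepted; other SOF types (lossless, arithmetic-coded,
 * hierarchical) never match stbi__SOF() and cause the header scan below to
 * fail. */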
static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan)
{
   int m;
   z->marker = STBI__MARKER_none; // initialize cached marker to empty
   m = stbi__get_marker(z);
   if (!stbi__SOI(m)) return stbi__err("no SOI", "Corrupt JPEG");
   if (scan == STBI__SCAN_type) return 1;
   m = stbi__get_marker(z);
   while (!stbi__SOF(m)) {
      if (!stbi__process_marker(z, m)) return 0;
      m = stbi__get_marker(z);
      while (m == STBI__MARKER_none) {
         // some files have extra padding after their blocks, so ok, we'll scan
         if (stbi__at_eof(z->s)) return stbi__err("no SOF", "Corrupt JPEG");
         m = stbi__get_marker(z);
      }
   }
   z->progressive = stbi__SOF_progressive(m);
   if (!stbi__process_frame_header(z, scan)) return 0;
   return 1;
}
// decode image to YCbCr format
static int stbi__decode_jpeg_image(stbi__jpeg *j)
{
   int m;
   for (m = 0; m < 4; m++) {
      j->img_comp[m].raw_data  = NULL;
      j->img_comp[m].raw_coeff = NULL;
   }
   j->restart_interval = 0;
   if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) return 0;
   m = stbi__get_marker(j);
   while (!stbi__EOI(m)) {
      if (stbi__SOS(m)) {
         if (!stbi__process_scan_header(j)) return 0;
         if (!stbi__parse_entropy_coded_data(j)) return 0;
         if (j->marker == STBI__MARKER_none) {
            // handle 0s at the end of image data from IP Kamera 9060
            while (!stbi__at_eof(j->s)) {
               int x = stbi__get8(j->s);
               if (x == 255) {
                  j->marker = stbi__get8(j->s);
                  break;
               } else if (x != 0) {
                  return stbi__err("junk before marker", "Corrupt JPEG");
               }
            }
            // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0
         }
      } else {
         if (!stbi__process_marker(j, m)) return 0;
      }
      m = stbi__get_marker(j);
   }
   if (j->progressive)
      stbi__jpeg_finish(j);
   return 1;
}
// static jfif-centered resampling (across block boundaries)

typedef uint8_t *(*resample_row_func)(uint8_t *out, uint8_t *in0, uint8_t *in1,
      int w, int hs);

#define stbi__div4(x) ((uint8_t) ((x) >> 2))
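
/* stbi__div4 and stbi__div16 (below) are the rounding right-shifts used by the
 * 2x upsamplers: the filters accumulate 3*near + far (plus a rounding bias)
 * and divide by 4, or combine two such passes and divide by 16. */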
static uint8_t *resample_row_1(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs)
{
   STBI_NOTUSED(out);
   STBI_NOTUSED(in_far);
   STBI_NOTUSED(w);
   STBI_NOTUSED(hs);
   return in_near;
}
static uint8_t *stbi__resample_row_v_2(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs)
{
   // need to generate two samples vertically for every one in input
   int i;
   STBI_NOTUSED(hs);
   for (i = 0; i < w; ++i)
      out[i] = stbi__div4(3 * in_near[i] + in_far[i] + 2);
   return out;
}

static uint8_t *stbi__resample_row_h_2(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs)
{
   // need to generate two samples horizontally for every one in input
   int i;
   uint8_t *input = in_near;

   if (w == 1) {
      // if only one sample, can't do any interpolation
      out[0] = out[1] = input[0];
      return out;
   }

   out[0] = input[0];
   out[1] = stbi__div4(input[0] * 3 + input[1] + 2);
   for (i = 1; i < w - 1; ++i) {
      int n = 3 * input[i] + 2;
      out[i * 2 + 0] = stbi__div4(n + input[i - 1]);
      out[i * 2 + 1] = stbi__div4(n + input[i + 1]);
   }
   out[i * 2 + 0] = stbi__div4(input[w - 2] * 3 + input[w - 1] + 2);
   out[i * 2 + 1] = input[w - 1];

   STBI_NOTUSED(in_far);
   STBI_NOTUSED(hs);

   return out;
}
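
/* In the two filters above, each output sample is a 3:1 weighted average of
 * the nearest source sample and its neighbour, e.g. out[2i] =
 * (3*in[i] + in[i-1] + 2) >> 2, which implements the centered ("jfif")
 * sample placement mentioned earlier. */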
#define stbi__div16(x) ((uint8_t) ((x) >> 4))
static uint8_t *stbi__resample_row_hv_2(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs)
{
   // need to generate 2x2 samples for every one in input
   int i, t0, t1;
   if (w == 1) {
      out[0] = out[1] = stbi__div4(3 * in_near[0] + in_far[0] + 2);
      return out;
   }

   t1 = 3 * in_near[0] + in_far[0];
   out[0] = stbi__div4(t1 + 2);
   for (i = 1; i < w; ++i) {
      t0 = t1;
      t1 = 3 * in_near[i] + in_far[i];
      out[i * 2 - 1] = stbi__div16(3 * t0 + t1 + 8);
      out[i * 2]     = stbi__div16(3 * t1 + t0 + 8);
   }
   out[w * 2 - 1] = stbi__div4(t1 + 2);

   STBI_NOTUSED(hs);

   return out;
}
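
/* stbi__resample_row_hv_2 is the separable 2x2 case: the vertical 3:1 blend
 * (3*near + far) is computed once per source column and carried in t0/t1, and
 * the horizontal 3:1 blend of two such terms gives the final /16 scale. */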
#if defined(STBI_SSE2) || defined(STBI_NEON)
static uint8_t *stbi__resample_row_hv_2_simd(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs)
{
   /* need to generate 2x2 samples for every one in input */
   int i = 0, t0, t1;

   if (w == 1) {
      out[0] = out[1] = stbi__div4(3 * in_near[0] + in_far[0] + 2);
      return out;
   }

   t1 = 3 * in_near[0] + in_far[0];
   /* process groups of 8 pixels for as long as we can.
    * note we can't handle the last pixel in a row in this loop
    * because we need to handle the filter boundary conditions.
    */
   for (; i < ((w - 1) & ~7); i += 8)
   {
#if defined(STBI_SSE2)
      /* load and perform the vertical filtering pass
       * this uses 3*x + y = 4*x + (y - x) */
      __m128i zero  = _mm_setzero_si128();
      __m128i farb  = _mm_loadl_epi64((__m128i *) (in_far + i));
      __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i));
      __m128i farw  = _mm_unpacklo_epi8(farb, zero);
      __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
      __m128i diff  = _mm_sub_epi16(farw, nearw);
      __m128i nears = _mm_slli_epi16(nearw, 2);
      __m128i curr  = _mm_add_epi16(nears, diff); /* current row */

      /* horizontal filter works the same based on shifted vers of current
       * row. "prev" is current row shifted right by 1 pixel; we need to
       * insert the previous pixel value (from t1).
       * "next" is current row shifted left by 1 pixel, with first pixel
       * of next block of 8 pixels added in.
       */
      __m128i prv0 = _mm_slli_si128(curr, 2);
      __m128i nxt0 = _mm_srli_si128(curr, 2);
      __m128i prev = _mm_insert_epi16(prv0, t1, 0);
      __m128i next = _mm_insert_epi16(nxt0, 3 * in_near[i + 8] + in_far[i + 8], 7);

      /* horizontal filter, polyphase implementation since it's convenient:
       * even pixels = 3*cur + prev = cur*4 + (prev - cur)
       * odd  pixels = 3*cur + next = cur*4 + (next - cur)
       * note the shared term. */
      __m128i bias = _mm_set1_epi16(8);
      __m128i curs = _mm_slli_epi16(curr, 2);
      __m128i prvd = _mm_sub_epi16(prev, curr);
      __m128i nxtd = _mm_sub_epi16(next, curr);
      __m128i curb = _mm_add_epi16(curs, bias);
      __m128i even = _mm_add_epi16(prvd, curb);
      __m128i odd  = _mm_add_epi16(nxtd, curb);

      /* interleave even and odd pixels, then undo scaling. */
      __m128i int0 = _mm_unpacklo_epi16(even, odd);
      __m128i int1 = _mm_unpackhi_epi16(even, odd);
      __m128i de0  = _mm_srli_epi16(int0, 4);
      __m128i de1  = _mm_srli_epi16(int1, 4);

      /* pack and write output */
      __m128i outv = _mm_packus_epi16(de0, de1);
      _mm_storeu_si128((__m128i *) (out + i * 2), outv);
#elif defined(STBI_NEON)
      // load and perform the vertical filtering pass
      // this uses 3*x + y = 4*x + (y - x)
      uint8x8_t farb  = vld1_u8(in_far + i);
      uint8x8_t nearb = vld1_u8(in_near + i);
      int16x8_t diff  = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
      int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
      int16x8_t curr  = vaddq_s16(nears, diff); // current row

      // horizontal filter works the same based on shifted vers of current
      // row. "prev" is current row shifted right by 1 pixel; we need to
      // insert the previous pixel value (from t1).
      // "next" is current row shifted left by 1 pixel, with first pixel
      // of next block of 8 pixels added in.
      int16x8_t prv0 = vextq_s16(curr, curr, 7);
      int16x8_t nxt0 = vextq_s16(curr, curr, 1);
      int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
      int16x8_t next = vsetq_lane_s16(3 * in_near[i + 8] + in_far[i + 8], nxt0, 7);

      /* horizontal filter, polyphase implementation since it's convenient:
       * even pixels = 3*cur + prev = cur*4 + (prev - cur)
       * odd  pixels = 3*cur + next = cur*4 + (next - cur)
       * note the shared term.
       */
      int16x8_t curs = vshlq_n_s16(curr, 2);
      int16x8_t prvd = vsubq_s16(prev, curr);
      int16x8_t nxtd = vsubq_s16(next, curr);
      int16x8_t even = vaddq_s16(curs, prvd);
      int16x8_t odd  = vaddq_s16(curs, nxtd);

      /* undo scaling and round, then store with even/odd phases interleaved */
      uint8x8x2_t o;
      o.val[0] = vqrshrun_n_s16(even, 4);
      o.val[1] = vqrshrun_n_s16(odd,  4);
      vst2_u8(out + i * 2, o);
#endif

      /* "previous" value for next iteration */
      t1 = 3 * in_near[i + 7] + in_far[i + 7];
   }

   t0 = t1;
   t1 = 3 * in_near[i] + in_far[i];
   out[i * 2] = stbi__div16(3 * t1 + t0 + 8);

   for (++i; i < w; ++i) {
      t0 = t1;
      t1 = 3 * in_near[i] + in_far[i];
      out[i * 2 - 1] = stbi__div16(3 * t0 + t1 + 8);
      out[i * 2]     = stbi__div16(3 * t1 + t0 + 8);
   }
   out[w * 2 - 1] = stbi__div4(t1 + 2);

   STBI_NOTUSED(hs);

   return out;
}
#endif
static uint8_t *stbi__resample_row_generic(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs)
{
   /* resample with nearest-neighbor */
   int i, j;
   STBI_NOTUSED(in_far);
   for (i = 0; i < w; ++i)
      for (j = 0; j < hs; ++j)
         out[i * hs + j] = in_near[i];
   return out;
}
#ifdef STBI_JPEG_OLD
/* this is the same YCbCr-to-RGB calculation that stb_image has used
 * historically before the algorithm changes in 1.49 */
#define float2fixed(x) ((int) ((x) * 65536 + 0.5))
static void stbi__YCbCr_to_RGB_row(uint8_t *out, const uint8_t *y, const uint8_t *pcb, const uint8_t *pcr, int count, int step)
{
   int i;
   for (i = 0; i < count; ++i) {
      int y_fixed = (y[i] << 16) + 32768; // rounding
      int r, g, b;
      int cr = pcr[i] - 128;
      int cb = pcb[i] - 128;
      r = y_fixed + cr * float2fixed(1.40200f);
      g = y_fixed - cr * float2fixed(0.71414f) - cb * float2fixed(0.34414f);
      b = y_fixed + cb * float2fixed(1.77200f);
      r >>= 16;
      g >>= 16;
      b >>= 16;
      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
      out[0] = (uint8_t)r;
      out[1] = (uint8_t)g;
      out[2] = (uint8_t)b;
      out[3] = 255;
      out += step;
   }
}
#else
/* this is a reduced-precision calculation of YCbCr-to-RGB introduced
 * to make sure the code produces the same results in both SIMD and scalar */
#define float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
static void stbi__YCbCr_to_RGB_row(uint8_t *out, const uint8_t *y, const uint8_t *pcb, const uint8_t *pcr, int count, int step)
{
   int i;
   for (i = 0; i < count; ++i) {
      int y_fixed = (y[i] << 20) + (1 << 19); /* rounding */
      int r, g, b;
      int cr = pcr[i] - 128;
      int cb = pcb[i] - 128;
      r = y_fixed + cr * float2fixed(1.40200f);
      g = y_fixed + (cr * -float2fixed(0.71414f)) + ((cb * -float2fixed(0.34414f)) & 0xffff0000);
      b = y_fixed + cb * float2fixed(1.77200f);
      r >>= 20;
      g >>= 20;
      b >>= 20;
      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
      out[0] = (uint8_t)r;
      out[1] = (uint8_t)g;
      out[2] = (uint8_t)b;
      out[3] = 255;
      out += step;
   }
}
#endif
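
/* The constants 1.40200, 0.71414, 0.34414 and 1.77200 are the standard JFIF
 * YCbCr-to-RGB coefficients.  In the reduced-precision path float2fixed()
 * stores them as 12-bit fixed point shifted up by 8 (a 2^20 scale), so
 * products with the 8-bit chroma terms still fit in 32 bits; the
 * "& 0xffff0000" on the Cb contribution to green drops its low 16 bits so
 * the scalar path rounds exactly like the 16-bit high-half multiplies used by
 * the SIMD version below. */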
#if defined(STBI_SSE2) || defined(STBI_NEON)
static void stbi__YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y, const uint8_t *pcb, const uint8_t *pcr, int count, int step)
{
   int i = 0;

#ifdef STBI_SSE2
   /* step == 3 is pretty ugly on the final interleave, and i'm not convinced
    * it's useful in practice (you wouldn't use it for textures, for example).
    * so just accelerate step == 4 case.
    */
   if (step == 4)
   {
      /* this is a fairly straightforward implementation and not super-optimized. */
      __m128i signflip  = _mm_set1_epi8(-0x80);
      __m128i cr_const0 = _mm_set1_epi16(  (short) (1.40200f * 4096.0f + 0.5f));
      __m128i cr_const1 = _mm_set1_epi16(- (short) (0.71414f * 4096.0f + 0.5f));
      __m128i cb_const0 = _mm_set1_epi16(- (short) (0.34414f * 4096.0f + 0.5f));
      __m128i cb_const1 = _mm_set1_epi16(  (short) (1.77200f * 4096.0f + 0.5f));
      __m128i y_bias    = _mm_set1_epi8((char) (unsigned char) 128);
      __m128i xw        = _mm_set1_epi16(255); /* alpha channel */

      for (; i + 7 < count; i += 8)
      {
         // load
         __m128i y_bytes   = _mm_loadl_epi64((__m128i *) (y + i));
         __m128i cr_bytes  = _mm_loadl_epi64((__m128i *) (pcr + i));
         __m128i cb_bytes  = _mm_loadl_epi64((__m128i *) (pcb + i));
         __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
         __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128

         // unpack to short (and left-shift cr, cb by 8)
         __m128i yw  = _mm_unpacklo_epi8(y_bias, y_bytes);
         __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
         __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);

         // color transform
         __m128i yws = _mm_srli_epi16(yw, 4);
         __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
         __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
         __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
         __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
         __m128i rws = _mm_add_epi16(cr0, yws);
         __m128i gwt = _mm_add_epi16(cb0, yws);
         __m128i bws = _mm_add_epi16(yws, cb1);
         __m128i gws = _mm_add_epi16(gwt, cr1);

         // descale
         __m128i rw = _mm_srai_epi16(rws, 4);
         __m128i bw = _mm_srai_epi16(bws, 4);
         __m128i gw = _mm_srai_epi16(gws, 4);

         // back to byte, set up for transpose
         __m128i brb = _mm_packus_epi16(rw, bw);
         __m128i gxb = _mm_packus_epi16(gw, xw);

         // transpose to interleave channels
         __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
         __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
         __m128i o0 = _mm_unpacklo_epi16(t0, t1);
         __m128i o1 = _mm_unpackhi_epi16(t0, t1);

         // store
         _mm_storeu_si128((__m128i *) (out +  0), o0);
         _mm_storeu_si128((__m128i *) (out + 16), o1);
         out += 32;
      }
   }
#endif

#ifdef STBI_NEON
   // in this version, step=3 support would be easy to add. but is there demand?
   if (step == 4) {
      // this is a fairly straightforward implementation and not super-optimized.
      uint8x8_t signflip  = vdup_n_u8(0x80);
      int16x8_t cr_const0 = vdupq_n_s16(  (short) (1.40200f * 4096.0f + 0.5f));
      int16x8_t cr_const1 = vdupq_n_s16(- (short) (0.71414f * 4096.0f + 0.5f));
      int16x8_t cb_const0 = vdupq_n_s16(- (short) (0.34414f * 4096.0f + 0.5f));
      int16x8_t cb_const1 = vdupq_n_s16(  (short) (1.77200f * 4096.0f + 0.5f));

      for (; i + 7 < count; i += 8) {
         // load
         uint8x8_t y_bytes   = vld1_u8(y + i);
         uint8x8_t cr_bytes  = vld1_u8(pcr + i);
         uint8x8_t cb_bytes  = vld1_u8(pcb + i);
         int8x8_t  cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
         int8x8_t  cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));

         // expand to s16
         int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
         int16x8_t crw = vshll_n_s8(cr_biased, 7);
         int16x8_t cbw = vshll_n_s8(cb_biased, 7);

         // color transform
         int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
         int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
         int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
         int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
         int16x8_t rws = vaddq_s16(yws, cr0);
         int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
         int16x8_t bws = vaddq_s16(yws, cb1);

         // undo scaling, round, convert to byte
         uint8x8x4_t o;
         o.val[0] = vqrshrun_n_s16(rws, 4);
         o.val[1] = vqrshrun_n_s16(gws, 4);
         o.val[2] = vqrshrun_n_s16(bws, 4);
         o.val[3] = vdup_n_u8(255);

         // store, interleaving r/g/b/a
         vst4_u8(out, o);
         out += 8*4;
      }
   }
#endif

   for (; i < count; ++i) {
      int y_fixed = (y[i] << 20) + (1 << 19); // rounding
      int r, g, b;
      int cr = pcr[i] - 128;
      int cb = pcb[i] - 128;
      r = y_fixed + cr * float2fixed(1.40200f);
      g = y_fixed + cr * -float2fixed(0.71414f) + ((cb * -float2fixed(0.34414f)) & 0xffff0000);
      b = y_fixed + cb * float2fixed(1.77200f);
      r >>= 20;
      g >>= 20;
      b >>= 20;
      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
      out[0] = (uint8_t)r;
      out[1] = (uint8_t)g;
      out[2] = (uint8_t)b;
      out[3] = 255;
      out += step;
   }
}
#endif
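
/* Kernel selection note: the SSE2 kernels are installed only after a runtime
 * stbi__sse2_available() check, whereas the NEON kernels are selected
 * unconditionally because NEON support is a compile-time decision on ARM
 * targets. */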
/* set up the kernels */
static void stbi__setup_jpeg(stbi__jpeg *j)
{
   j->idct_block_kernel        = stbi__idct_block;
   j->YCbCr_to_RGB_kernel      = stbi__YCbCr_to_RGB_row;
   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2;

#ifdef STBI_SSE2
   if (stbi__sse2_available()) {
      j->idct_block_kernel        = stbi__idct_simd;
#ifndef STBI_JPEG_OLD
      j->YCbCr_to_RGB_kernel      = stbi__YCbCr_to_RGB_simd;
#endif
      j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
   }
#endif

#ifdef STBI_NEON
   j->idct_block_kernel        = stbi__idct_simd;
#ifndef STBI_JPEG_OLD
   j->YCbCr_to_RGB_kernel      = stbi__YCbCr_to_RGB_simd;
#endif
   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
#endif
}
/* clean up the temporary component buffers */
static void stbi__cleanup_jpeg(stbi__jpeg *j)
{
   int i;
   for (i = 0; i < j->s->img_n; ++i) {
      if (j->img_comp[i].raw_data) {
         free(j->img_comp[i].raw_data);
         j->img_comp[i].raw_data = NULL;
         j->img_comp[i].data = NULL;
      }
      if (j->img_comp[i].raw_coeff) {
         free(j->img_comp[i].raw_coeff);
         j->img_comp[i].raw_coeff = 0;
         j->img_comp[i].coeff = 0;
      }
      if (j->img_comp[i].linebuf) {
         free(j->img_comp[i].linebuf);
         j->img_comp[i].linebuf = NULL;
      }
   }
}
typedef struct
{
   resample_row_func resample;
   uint8_t *line0, *line1;
   int hs, vs;    // expansion factor in each axis
   int w_lores;   // horizontal pixels pre-expansion
   int ystep;     // how far through vertical expansion we are
   int ypos;      // which pre-expansion row we're on
} stbi__resample;
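
/* line0/line1 hold a sliding two-row window into the component's decoded
 * plane; depending on the vertical phase (ystep) one is passed as the "near"
 * row and the other as the "far" row to the resample callback, while ypos
 * tracks progress through the vs-fold vertical expansion. */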
static uint8_t *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp)
{
   int n, decode_n;
   z->s->img_n = 0; // make stbi__cleanup_jpeg safe

   // validate req_comp
   if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");

   // load a jpeg image from whichever source, but leave in YCbCr format
   if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; }

   // determine actual number of components to generate
   n = req_comp ? req_comp : z->s->img_n;

   if (z->s->img_n == 3 && n < 3)
      decode_n = 1;
   else
      decode_n = z->s->img_n;

   // resample and color-convert
   {
      int k;
      unsigned int i, j;
      uint8_t *output;
      uint8_t *coutput[4];
      stbi__resample res_comp[4];

      for (k = 0; k < decode_n; ++k) {
         stbi__resample *r = &res_comp[k];

         // allocate line buffer big enough for upsampling off the edges
         // with upsample factor of 4
         z->img_comp[k].linebuf = (uint8_t *) malloc(z->s->img_x + 3);
         if (!z->img_comp[k].linebuf) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }

         r->hs      = z->img_h_max / z->img_comp[k].h;
         r->vs      = z->img_v_max / z->img_comp[k].v;
         r->ystep   = r->vs >> 1;
         r->w_lores = (z->s->img_x + r->hs - 1) / r->hs;
         r->ypos    = 0;
         r->line0   = r->line1 = z->img_comp[k].data;

         if      (r->hs == 1 && r->vs == 1) r->resample = resample_row_1;
         else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2;
         else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2;
         else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel;
         else                               r->resample = stbi__resample_row_generic;
      }

      // can't error after this, so this is safe
      output = (uint8_t *) malloc(n * z->s->img_x * z->s->img_y + 1);
      if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }

      // now go ahead and resample
      for (j = 0; j < z->s->img_y; ++j) {
         uint8_t *out = output + n * z->s->img_x * j;
         for (k = 0; k < decode_n; ++k) {
            stbi__resample *r = &res_comp[k];
            int y_bot = r->ystep >= (r->vs >> 1);
            coutput[k] = r->resample(z->img_comp[k].linebuf,
                  y_bot ? r->line1 : r->line0,
                  y_bot ? r->line0 : r->line1,
                  r->w_lores, r->hs);
            if (++r->ystep >= r->vs) {
               r->ystep = 0;
               r->line0 = r->line1;
               if (++r->ypos < z->img_comp[k].y)
                  r->line1 += z->img_comp[k].w2;
            }
         }
         if (n >= 3) {
            uint8_t *y = coutput[0];
            if (z->s->img_n == 3) {
               z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
            } else
               for (i = 0; i < z->s->img_x; ++i) {
                  out[0] = out[1] = out[2] = y[i];
                  out[3] = 255; // not used if n==3
                  out += n;
               }
         } else {
            uint8_t *y = coutput[0];
            if (n == 1)
               for (i = 0; i < z->s->img_x; ++i) out[i] = y[i];
            else
               for (i = 0; i < z->s->img_x; ++i) *out++ = y[i], *out++ = 255;
         }
      }

      stbi__cleanup_jpeg(z);
      *out_x = z->s->img_x;
      *out_y = z->s->img_y;
      if (comp) *comp = z->s->img_n; // report original components, not output
      return output;
   }
}
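
/* Note that when fewer than three output channels are requested from a
 * three-component JPEG, decode_n is reduced to 1 above, so only the luma
 * plane is resampled and the chroma planes are never color-converted. */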
static unsigned char *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
{
   stbi__jpeg j;
   j.s = s;
   stbi__setup_jpeg(&j);
   return load_jpeg_image(&j, x, y, comp, req_comp);
}
static int stbi__jpeg_test(stbi__context *s)
{
   int r;
   stbi__jpeg j;
   j.s = s;
   stbi__setup_jpeg(&j);
   r = stbi__decode_jpeg_header(&j, STBI__SCAN_type);
   stbi__rewind(s);
   return r;
}
#endif
bool rjpeg_image_load(uint8_t *_buf, void *data, size_t size,
      unsigned a_shift, unsigned r_shift,
      unsigned g_shift, unsigned b_shift)
{
   unsigned i;
   int x, y, comp;
   struct texture_image *out_img = (struct texture_image*)data;

   out_img->pixels = stbi_load_from_memory(_buf, size, &x, &y, &comp, 4);

   out_img->width  = x;
   out_img->height = y;

#if 0
   for (i = 0; i < (x * y); i++)
   {
      uint32_t r = (_buf[i] & 0xff00ff00);
      uint32_t g = ((_buf[i] << 16) & 0x00ff0000);
      uint32_t b = ((_buf[i] >> 16) & 0xff);

      if (r_shift == 0 && b_shift == 16)
         out_img->pixels[i] = _buf[i];
      else
         out_img->pixels[i] = r | g | b;
      //out_img->pixels[i] = (r << r_shift) | (g << g_shift) || (b << b_shift);
   }
#endif

   return true;
}
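
/* Hypothetical usage sketch (not from the original source): the caller owns
 * the texture_image and the RGBA buffer allocated by stbi_load_from_memory,
 * e.g.
 *
 *    struct texture_image img = {0};
 *    if (rjpeg_image_load(jpeg_bytes, &img, jpeg_size, 24, 16, 8, 0))
 *    {
 *       // img.pixels now holds img.width * img.height RGBA pixels
 *       ...
 *       free(img.pixels);
 *    }
 *
 * The shift parameters are currently unused (the swizzling loop above is
 * compiled out with #if 0), so the values passed here are illustrative only. */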