#include #include #include /* ptrdiff_t on osx */ #include #include #include #include #include #include enum { STBI_default = 0, /* only used for req_comp */ STBI_grey = 1, STBI_grey_alpha = 2, STBI_rgb = 3, STBI_rgb_alpha = 4 }; typedef struct { int (*read) (void *user,char *data,int size); /* fill 'data' with 'size' bytes. return number of bytes actually read */ void (*skip) (void *user,int n); /* skip the next 'n' bytes, or 'unget' the last -n bytes if negative */ int (*eof) (void *user); /* returns nonzero if we are at end of file/data */ } stbi_io_callbacks; /* should produce compiler error if size is wrong */ typedef unsigned char validate_uint32[sizeof(uint32_t)==4 ? 1 : -1]; #ifdef _MSC_VER #define STBI_NOTUSED(v) (void)(v) #else #define STBI_NOTUSED(v) (void)sizeof(v) #endif #ifdef _MSC_VER #define STBI_HAS_LROTL #endif #ifdef STBI_HAS_LROTL #define stbi_lrot(x,y) _lrotl(x,y) #else #define stbi_lrot(x,y) (((x) << (y)) | ((x) >> (32 - (y)))) #endif // x86/x64 detection #if defined(__x86_64__) || defined(_M_X64) #define STBI__X64_TARGET #elif defined(__i386) || defined(_M_IX86) #define STBI__X86_TARGET #endif #if defined(__GNUC__) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET)) && !defined(__SSE2__) && !defined(STBI_NO_SIMD) /* NOTE: not clear do we actually need this for the 64-bit path? * gcc doesn't support sse2 intrinsics unless you compile with -msse2, * (but compiling with -msse2 allows the compiler to use SSE2 everywhere; * this is just broken and gcc are jerks for not fixing it properly * http://www.virtualdub.org/blog/pivot/entry.php?id=363 ) */ #define STBI_NO_SIMD #endif #if defined(__MINGW32__) && defined(STBI__X86_TARGET) && !defined(STBI_MINGW_ENABLE_SSE2) && !defined(STBI_NO_SIMD) /* Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid STBI__X64_TARGET * * 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the * Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant. * As a result, enabling SSE2 on 32-bit MinGW is dangerous when not * simultaneously enabling "-mstackrealign". * * See https://github.com/nothings/stb/issues/81 for more information. * * So default to no SSE2 on 32-bit MinGW. If you've read this far and added * -mstackrealign to your build settings, feel free to #define STBI_MINGW_ENABLE_SSE2. */ #define STBI_NO_SIMD #endif #if !defined(STBI_NO_SIMD) && defined(STBI__X86_TARGET) #define STBI_SSE2 #include #ifdef _MSC_VER #if _MSC_VER >= 1400 /* not VC6 */ #include /* __cpuid */ static int stbi__cpuid3(void) { int info[4]; __cpuid(info,1); return info[3]; } #else static int stbi__cpuid3(void) { int res; __asm { mov eax,1 cpuid mov res,edx } return res; } #endif #define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name static int stbi__sse2_available() { int info3 = stbi__cpuid3(); return ((info3 >> 26) & 1) != 0; } #else /* assume GCC-style if not VC++ */ #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16))) static int stbi__sse2_available() { #if defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 408 /* GCC 4.8 or later */ /* GCC 4.8+ has a nice way to do this */ return __builtin_cpu_supports("sse2"); #else /* portable way to do this, preferably without using GCC inline ASM? * just bail for now. */ return 0; #endif } #endif #endif /* ARM NEON */ #if defined(STBI_NO_SIMD) && defined(STBI_NEON) #undef STBI_NEON #endif #ifdef STBI_NEON #include /* assume GCC or Clang on ARM targets */ #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16))) #endif #ifndef STBI_SIMD_ALIGN #define STBI_SIMD_ALIGN(type, name) type name #endif /////////////////////////////////////////////// // // stbi__context struct and start_xxx functions // stbi__context structure is our basic context used by all images, so it // contains all the IO context, plus some basic image information typedef struct { uint32_t img_x, img_y; int img_n, img_out_n; stbi_io_callbacks io; void *io_user_data; int read_from_callbacks; int buflen; uint8_t buffer_start[128]; uint8_t *img_buffer, *img_buffer_end; uint8_t *img_buffer_original; } stbi__context; static void stbi__refill_buffer(stbi__context *s); // initialize a memory-decode context static void stbi__start_mem(stbi__context *s, const uint8_t *buffer, int len) { s->io.read = NULL; s->read_from_callbacks = 0; s->img_buffer = s->img_buffer_original = (uint8_t *) buffer; s->img_buffer_end = (uint8_t *) buffer+len; } static void stbi__rewind(stbi__context *s) { /* conceptually rewind SHOULD rewind to the beginning of the stream, * but we just rewind to the beginning of the initial buffer, because * we only use it after doing 'test', which only ever looks at at most 92 bytes */ s->img_buffer = s->img_buffer_original; } #ifndef STBI_NO_JPEG static int stbi__jpeg_test(stbi__context *s); static uint8_t *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp); #endif // this is not threadsafe static const char *stbi__g_failure_reason; static int stbi__err(const char *str) { stbi__g_failure_reason = str; return 0; } // stbi__err - error // stbi__errpf - error returning pointer to float // stbi__errpuc - error returning pointer to unsigned char #ifdef STBI_NO_FAILURE_STRINGS #define stbi__err(x,y) 0 #elif defined(STBI_FAILURE_USERMSG) #define stbi__err(x,y) stbi__err(y) #else #define stbi__err(x,y) stbi__err(x) #endif #define stbi__errpf(x,y) ((float *) (stbi__err(x,y)?NULL:NULL)) #define stbi__errpuc(x,y) ((unsigned char *) (stbi__err(x,y)?NULL:NULL)) static int stbi__vertically_flip_on_load = 0; static unsigned char *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp) { #ifndef STBI_NO_JPEG if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp); #endif return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt"); } static unsigned char *stbi__load_flip(stbi__context *s, int *x, int *y, int *comp, int req_comp) { unsigned char *result = stbi__load_main(s, x, y, comp, req_comp); if (stbi__vertically_flip_on_load && result != NULL) { int w = *x, h = *y; int depth = req_comp ? req_comp : *comp; int row,col,z; uint8_t temp; // @OPTIMIZE: use a bigger temp buffer and memcpy multiple pixels at once for (row = 0; row < (h>>1); row++) { for (col = 0; col < w; col++) { for (z = 0; z < depth; z++) { temp = result[(row * w + col) * depth + z]; result[(row * w + col) * depth + z] = result[((h - row - 1) * w + col) * depth + z]; result[((h - row - 1) * w + col) * depth + z] = temp; } } } } return result; } static uint8_t *stbi_load_from_memory(const uint8_t *buffer, int len, int *x, int *y, int *comp, int req_comp) { stbi__context s; stbi__start_mem(&s,buffer,len); return stbi__load_flip(&s,x,y,comp,req_comp); } ////////////////////////////////////////////////////////////////////////////// // // Common code used by all image loaders // enum { STBI__SCAN_load=0, STBI__SCAN_type, STBI__SCAN_header }; static void stbi__refill_buffer(stbi__context *s) { int n = (s->io.read)(s->io_user_data,(char*)s->buffer_start,s->buflen); if (n == 0) { // at end of file, treat same as if from memory, but need to handle case // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file s->read_from_callbacks = 0; s->img_buffer = s->buffer_start; s->img_buffer_end = s->buffer_start+1; *s->img_buffer = 0; } else { s->img_buffer = s->buffer_start; s->img_buffer_end = s->buffer_start + n; } } static INLINE uint8_t stbi__get8(stbi__context *s) { if (s->img_buffer < s->img_buffer_end) return *s->img_buffer++; if (s->read_from_callbacks) { stbi__refill_buffer(s); return *s->img_buffer++; } return 0; } static INLINE int stbi__at_eof(stbi__context *s) { if (s->io.read) { if (!(s->io.eof)(s->io_user_data)) return 0; // if feof() is true, check if buffer = end // special case: we've only got the special 0 character at the end if (s->read_from_callbacks == 0) return 1; } return s->img_buffer >= s->img_buffer_end; } static void stbi__skip(stbi__context *s, int n) { if (n < 0) { s->img_buffer = s->img_buffer_end; return; } if (s->io.read) { int blen = (int) (s->img_buffer_end - s->img_buffer); if (blen < n) { s->img_buffer = s->img_buffer_end; (s->io.skip)(s->io_user_data, n - blen); return; } } s->img_buffer += n; } static int stbi__get16be(stbi__context *s) { int z = stbi__get8(s); return (z << 8) + stbi__get8(s); } #define STBI__BYTECAST(x) ((uint8_t) ((x) & 255)) // truncate int to byte without warnings ////////////////////////////////////////////////////////////////////////////// // // "baseline" JPEG/JFIF decoder // // simple implementation // - doesn't support delayed output of y-dimension // - simple interface (only one output format: 8-bit interleaved RGB) // - doesn't try to recover corrupt jpegs // - doesn't allow partial loading, loading multiple at once // - still fast on x86 (copying globals into locals doesn't help x86) // - allocates lots of intermediate memory (full size of all components) // - non-interleaved case requires this anyway // - allows good upsampling (see next) // high-quality // - upsampled channels are bilinearly interpolated, even across blocks // - quality integer IDCT derived from IJG's 'slow' // performance // - fast huffman; reasonable integer IDCT // - some SIMD kernels for common paths on targets with SSE2/NEON // - uses a lot of intermediate memory, could cache poorly #ifndef STBI_NO_JPEG // huffman decoding acceleration #define FAST_BITS 9 // larger handles more cases; smaller stomps less cache typedef struct { uint8_t fast[1 << FAST_BITS]; // weirdly, repacking this into AoS is a 10% speed loss, instead of a win uint16_t code[256]; uint8_t values[256]; uint8_t size[257]; unsigned int maxcode[18]; int delta[17]; // old 'firstsymbol' - old 'firstcode' } stbi__huffman; typedef struct { stbi__context *s; stbi__huffman huff_dc[4]; stbi__huffman huff_ac[4]; uint8_t dequant[4][64]; int16_t fast_ac[4][1 << FAST_BITS]; // sizes for components, interleaved MCUs int img_h_max, img_v_max; int img_mcu_x, img_mcu_y; int img_mcu_w, img_mcu_h; // definition of jpeg image component struct { int id; int h,v; int tq; int hd,ha; int dc_pred; int x,y,w2,h2; uint8_t *data; void *raw_data, *raw_coeff; uint8_t *linebuf; short *coeff; // progressive only int coeff_w, coeff_h; // number of 8x8 coefficient blocks } img_comp[4]; uint32_t code_buffer; // jpeg entropy-coded buffer int code_bits; // number of valid bits unsigned char marker; // marker seen while filling entropy buffer int nomore; // flag if we saw a marker so must stop int progressive; int spec_start; int spec_end; int succ_high; int succ_low; int eob_run; int scan_n, order[4]; int restart_interval, todo; // kernels void (*idct_block_kernel)(uint8_t *out, int out_stride, short data[64]); void (*YCbCr_to_RGB_kernel)(uint8_t *out, const uint8_t *y, const uint8_t *pcb, const uint8_t *pcr, int count, int step); uint8_t *(*resample_row_hv_2_kernel)(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs); } stbi__jpeg; static int stbi__build_huffman(stbi__huffman *h, int *count) { int i,j,k=0,code; // build size list for each symbol (from JPEG spec) for (i=0; i < 16; ++i) for (j=0; j < count[i]; ++j) h->size[k++] = (uint8_t) (i+1); h->size[k] = 0; // compute actual symbols (from jpeg spec) code = 0; k = 0; for(j=1; j <= 16; ++j) { // compute delta to add to code to compute symbol id h->delta[j] = k - code; if (h->size[k] == j) { while (h->size[k] == j) h->code[k++] = (uint16_t) (code++); if (code-1 >= (1 << j)) return stbi__err("bad code lengths","Corrupt JPEG"); } // compute largest code + 1 for this size, preshifted as needed later h->maxcode[j] = code << (16-j); code <<= 1; } h->maxcode[j] = 0xffffffff; // build non-spec acceleration table; 255 is flag for not-accelerated memset(h->fast, 255, 1 << FAST_BITS); for (i=0; i < k; ++i) { int s = h->size[i]; if (s <= FAST_BITS) { int c = h->code[i] << (FAST_BITS-s); int m = 1 << (FAST_BITS-s); for (j=0; j < m; ++j) { h->fast[c+j] = (uint8_t) i; } } } return 1; } // build a table that decodes both magnitude and value of small ACs in // one go. static void stbi__build_fast_ac(int16_t *fast_ac, stbi__huffman *h) { int i; for (i=0; i < (1 << FAST_BITS); ++i) { uint8_t fast = h->fast[i]; fast_ac[i] = 0; if (fast < 255) { int rs = h->values[fast]; int run = (rs >> 4) & 15; int magbits = rs & 15; int len = h->size[fast]; if (magbits && len + magbits <= FAST_BITS) { // magnitude code followed by receive_extend code int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits); int m = 1 << (magbits - 1); if (k < m) k += (-1 << magbits) + 1; // if the result is small enough, we can fit it in fast_ac table if (k >= -128 && k <= 127) fast_ac[i] = (int16_t) ((k << 8) + (run << 4) + (len + magbits)); } } } } static void stbi__grow_buffer_unsafe(stbi__jpeg *j) { do { int b = j->nomore ? 0 : stbi__get8(j->s); if (b == 0xff) { int c = stbi__get8(j->s); if (c != 0) { j->marker = (unsigned char) c; j->nomore = 1; return; } } j->code_buffer |= b << (24 - j->code_bits); j->code_bits += 8; } while (j->code_bits <= 24); } // (1 << n) - 1 static uint32_t stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535}; // decode a jpeg huffman value from the bitstream static INLINE int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h) { unsigned int temp; int c,k; if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); // look at the top FAST_BITS and determine what symbol ID it is, // if the code is <= FAST_BITS c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); k = h->fast[c]; if (k < 255) { int s = h->size[k]; if (s > j->code_bits) return -1; j->code_buffer <<= s; j->code_bits -= s; return h->values[k]; } // naive test is to shift the code_buffer down so k bits are // valid, then test against maxcode. To speed this up, we've // preshifted maxcode left so that it has (16-k) 0s at the // end; in other words, regardless of the number of bits, it // wants to be compared against something shifted to have 16; // that way we don't need to shift inside the loop. temp = j->code_buffer >> 16; for (k=FAST_BITS+1 ; ; ++k) if (temp < h->maxcode[k]) break; if (k == 17) { // error! code not found j->code_bits -= 16; return -1; } if (k > j->code_bits) return -1; // convert the huffman code to the symbol id c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k]; assert((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]); // convert the id to a symbol j->code_bits -= k; j->code_buffer <<= k; return h->values[c]; } // bias[n] = (-1<code_bits < n) stbi__grow_buffer_unsafe(j); sgn = (int32_t)j->code_buffer >> 31; // sign bit is always in MSB k = stbi_lrot(j->code_buffer, n); assert(n >= 0 && n < (int) (sizeof(stbi__bmask)/sizeof(*stbi__bmask))); j->code_buffer = k & ~stbi__bmask[n]; k &= stbi__bmask[n]; j->code_bits -= n; return k + (stbi__jbias[n] & ~sgn); } // get some unsigned bits static INLINE int stbi__jpeg_get_bits(stbi__jpeg *j, int n) { unsigned int k; if (j->code_bits < n) stbi__grow_buffer_unsafe(j); k = stbi_lrot(j->code_buffer, n); j->code_buffer = k & ~stbi__bmask[n]; k &= stbi__bmask[n]; j->code_bits -= n; return k; } static INLINE int stbi__jpeg_get_bit(stbi__jpeg *j) { unsigned int k; if (j->code_bits < 1) stbi__grow_buffer_unsafe(j); k = j->code_buffer; j->code_buffer <<= 1; --j->code_bits; return k & 0x80000000; } // given a value that's at position X in the zigzag stream, // where does it appear in the 8x8 matrix coded as row-major? static uint8_t stbi__jpeg_dezigzag[64+15] = { 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63, // let corrupt input sample past end 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 }; // decode one 64-entry block-- static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, int16_t *fac, int b, uint8_t *dequant) { int diff,dc,k; int t; if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); t = stbi__jpeg_huff_decode(j, hdc); if (t < 0) return stbi__err("bad huffman code","Corrupt JPEG"); // 0 all the ac values now so we can do it 32-bits at a time memset(data,0,64*sizeof(data[0])); diff = t ? stbi__extend_receive(j, t) : 0; dc = j->img_comp[b].dc_pred + diff; j->img_comp[b].dc_pred = dc; data[0] = (short) (dc * dequant[0]); // decode AC components, see JPEG spec k = 1; do { unsigned int zig; int c,r,s; if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); r = fac[c]; if (r) { // fast-AC path k += (r >> 4) & 15; // run s = r & 15; // combined length j->code_buffer <<= s; j->code_bits -= s; // decode into unzigzag'd location zig = stbi__jpeg_dezigzag[k++]; data[zig] = (short) ((r >> 8) * dequant[zig]); } else { int rs = stbi__jpeg_huff_decode(j, hac); if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); s = rs & 15; r = rs >> 4; if (s == 0) { if (rs != 0xf0) break; // end block k += 16; } else { k += r; // decode into unzigzag'd location zig = stbi__jpeg_dezigzag[k++]; data[zig] = (short) (stbi__extend_receive(j,s) * dequant[zig]); } } } while (k < 64); return 1; } static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__huffman *hdc, int b) { if (j->spec_end != 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); if (j->succ_high == 0) { int diff,dc; int t; /* first scan for DC coefficient, must be first */ memset(data,0,64*sizeof(data[0])); // 0 all the ac values now t = stbi__jpeg_huff_decode(j, hdc); diff = t ? stbi__extend_receive(j, t) : 0; dc = j->img_comp[b].dc_pred + diff; j->img_comp[b].dc_pred = dc; data[0] = (short) (dc << j->succ_low); } else { /* refinement scan for DC coefficient */ if (stbi__jpeg_get_bit(j)) data[0] += (short) (1 << j->succ_low); } return 1; } // @OPTIMIZE: store non-zigzagged during the decode passes, // and only de-zigzag when dequantizing static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__huffman *hac, int16_t *fac) { int k; if (j->spec_start == 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); if (j->succ_high == 0) { int shift = j->succ_low; if (j->eob_run) { --j->eob_run; return 1; } k = j->spec_start; do { unsigned int zig; int c,r,s; if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); r = fac[c]; if (r) { // fast-AC path k += (r >> 4) & 15; // run s = r & 15; // combined length j->code_buffer <<= s; j->code_bits -= s; zig = stbi__jpeg_dezigzag[k++]; data[zig] = (short) ((r >> 8) << shift); } else { int rs = stbi__jpeg_huff_decode(j, hac); if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); s = rs & 15; r = rs >> 4; if (s == 0) { if (r < 15) { j->eob_run = (1 << r); if (r) j->eob_run += stbi__jpeg_get_bits(j, r); --j->eob_run; break; } k += 16; } else { k += r; zig = stbi__jpeg_dezigzag[k++]; data[zig] = (short) (stbi__extend_receive(j,s) << shift); } } } while (k <= j->spec_end); } else { // refinement scan for these AC coefficients short bit = (short) (1 << j->succ_low); if (j->eob_run) { --j->eob_run; for (k = j->spec_start; k <= j->spec_end; ++k) { short *p = &data[stbi__jpeg_dezigzag[k]]; if (*p != 0) if (stbi__jpeg_get_bit(j)) if ((*p & bit)==0) { if (*p > 0) *p += bit; else *p -= bit; } } } else { k = j->spec_start; do { int r,s; int rs = stbi__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); s = rs & 15; r = rs >> 4; if (s == 0) { if (r < 15) { j->eob_run = (1 << r) - 1; if (r) j->eob_run += stbi__jpeg_get_bits(j, r); r = 64; // force end of block } else { // r=15 s=0 should write 16 0s, so we just do // a run of 15 0s and then write s (which is 0), // so we don't have to do anything special here } } else { if (s != 1) return stbi__err("bad huffman code", "Corrupt JPEG"); // sign bit if (stbi__jpeg_get_bit(j)) s = bit; else s = -bit; } // advance by r while (k <= j->spec_end) { short *p = &data[stbi__jpeg_dezigzag[k++]]; if (*p != 0) { if (stbi__jpeg_get_bit(j)) if ((*p & bit)==0) { if (*p > 0) *p += bit; else *p -= bit; } } else { if (r == 0) { *p = (short) s; break; } --r; } } } while (k <= j->spec_end); } } return 1; } // take a -128..127 value and stbi__clamp it and convert to 0..255 static INLINE uint8_t stbi__clamp(int x) { // trick to use a single test to catch both cases if ((unsigned int) x > 255) { if (x < 0) return 0; if (x > 255) return 255; } return (uint8_t) x; } #define stbi__f2f(x) ((int) (((x) * 4096 + 0.5))) #define stbi__fsh(x) ((x) << 12) // derived from jidctint -- DCT_ISLOW #define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \ int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \ p2 = s2; \ p3 = s6; \ p1 = (p2+p3) * stbi__f2f(0.5411961f); \ t2 = p1 + p3*stbi__f2f(-1.847759065f); \ t3 = p1 + p2*stbi__f2f( 0.765366865f); \ p2 = s0; \ p3 = s4; \ t0 = stbi__fsh(p2+p3); \ t1 = stbi__fsh(p2-p3); \ x0 = t0+t3; \ x3 = t0-t3; \ x1 = t1+t2; \ x2 = t1-t2; \ t0 = s7; \ t1 = s5; \ t2 = s3; \ t3 = s1; \ p3 = t0+t2; \ p4 = t1+t3; \ p1 = t0+t3; \ p2 = t1+t2; \ p5 = (p3+p4)*stbi__f2f( 1.175875602f); \ t0 = t0*stbi__f2f( 0.298631336f); \ t1 = t1*stbi__f2f( 2.053119869f); \ t2 = t2*stbi__f2f( 3.072711026f); \ t3 = t3*stbi__f2f( 1.501321110f); \ p1 = p5 + p1*stbi__f2f(-0.899976223f); \ p2 = p5 + p2*stbi__f2f(-2.562915447f); \ p3 = p3*stbi__f2f(-1.961570560f); \ p4 = p4*stbi__f2f(-0.390180644f); \ t3 += p1+p4; \ t2 += p2+p3; \ t1 += p2+p4; \ t0 += p1+p3; static void stbi__idct_block(uint8_t *out, int out_stride, short data[64]) { int i,val[64],*v=val; uint8_t *o; short *d = data; // columns for (i=0; i < 8; ++i,++d, ++v) { // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0 && d[40]==0 && d[48]==0 && d[56]==0) { // no shortcut 0 seconds // (1|2|3|4|5|6|7)==0 0 seconds // all separate -0.047 seconds // 1 && 2|3 && 4|5 && 6|7: -0.047 seconds int dcterm = d[0] << 2; v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm; } else { STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56]) // constants scaled things up by 1<<12; let's bring them back // down, but keep 2 extra bits of precision x0 += 512; x1 += 512; x2 += 512; x3 += 512; v[ 0] = (x0+t3) >> 10; v[56] = (x0-t3) >> 10; v[ 8] = (x1+t2) >> 10; v[48] = (x1-t2) >> 10; v[16] = (x2+t1) >> 10; v[40] = (x2-t1) >> 10; v[24] = (x3+t0) >> 10; v[32] = (x3-t0) >> 10; } } for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride) { // no fast case since the first 1D IDCT spread components out STBI__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7]) // constants scaled things up by 1<<12, plus we had 1<<2 from first // loop, plus horizontal and vertical each scale by sqrt(8) so together // we've got an extra 1<<3, so 1<<17 total we need to remove. // so we want to round that, which means adding 0.5 * 1<<17, // aka 65536. Also, we'll end up with -128 to 127 that we want // to encode as 0..255 by adding 128, so we'll add that before the shift x0 += 65536 + (128<<17); x1 += 65536 + (128<<17); x2 += 65536 + (128<<17); x3 += 65536 + (128<<17); // tried computing the shifts into temps, or'ing the temps to see // if any were out of range, but that was slower o[0] = stbi__clamp((x0+t3) >> 17); o[7] = stbi__clamp((x0-t3) >> 17); o[1] = stbi__clamp((x1+t2) >> 17); o[6] = stbi__clamp((x1-t2) >> 17); o[2] = stbi__clamp((x2+t1) >> 17); o[5] = stbi__clamp((x2-t1) >> 17); o[3] = stbi__clamp((x3+t0) >> 17); o[4] = stbi__clamp((x3-t0) >> 17); } } #ifdef STBI_SSE2 /* sse2 integer IDCT. not the fastest possible implementation but it * produces bit-identical results to the generic C version so it's * fully "transparent". */ static void stbi__idct_simd(uint8_t *out, int out_stride, short data[64]) { /* This is constructed to match our regular (generic) integer IDCT exactly. */ __m128i row0, row1, row2, row3, row4, row5, row6, row7; __m128i tmp; /* dot product constant: even elems=x, odd elems=y */ #define dct_const(x,y) _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y)) /* out(0) = c0[even]*x + c0[odd]*y (c0, x, y 16-bit, out 32-bit) * out(1) = c1[even]*x + c1[odd]*y */ #define dct_rot(out0,out1, x,y,c0,c1) \ __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \ __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \ __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \ __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \ __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \ __m128i out1##_h = _mm_madd_epi16(c0##hi, c1) /* out = in << 12 (in 16-bit, out 32-bit) */ #define dct_widen(out, in) \ __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \ __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4) /* wide add */ #define dct_wadd(out, a, b) \ __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \ __m128i out##_h = _mm_add_epi32(a##_h, b##_h) /* wide sub */ #define dct_wsub(out, a, b) \ __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \ __m128i out##_h = _mm_sub_epi32(a##_h, b##_h) /* butterfly a/b, add bias, then shift by "s" and pack */ #define dct_bfly32o(out0, out1, a,b,bias,s) \ { \ __m128i abiased_l = _mm_add_epi32(a##_l, bias); \ __m128i abiased_h = _mm_add_epi32(a##_h, bias); \ dct_wadd(sum, abiased, b); \ dct_wsub(dif, abiased, b); \ out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \ out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \ } /* 8-bit interleave step (for transposes) */ #define dct_interleave8(a, b) \ tmp = a; \ a = _mm_unpacklo_epi8(a, b); \ b = _mm_unpackhi_epi8(tmp, b) /* 16-bit interleave step (for transposes) */ #define dct_interleave16(a, b) \ tmp = a; \ a = _mm_unpacklo_epi16(a, b); \ b = _mm_unpackhi_epi16(tmp, b) #define dct_pass(bias,shift) \ { \ /* even part */ \ dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \ __m128i sum04 = _mm_add_epi16(row0, row4); \ __m128i dif04 = _mm_sub_epi16(row0, row4); \ dct_widen(t0e, sum04); \ dct_widen(t1e, dif04); \ dct_wadd(x0, t0e, t3e); \ dct_wsub(x3, t0e, t3e); \ dct_wadd(x1, t1e, t2e); \ dct_wsub(x2, t1e, t2e); \ /* odd part */ \ dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \ dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \ __m128i sum17 = _mm_add_epi16(row1, row7); \ __m128i sum35 = _mm_add_epi16(row3, row5); \ dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \ dct_wadd(x4, y0o, y4o); \ dct_wadd(x5, y1o, y5o); \ dct_wadd(x6, y2o, y5o); \ dct_wadd(x7, y3o, y4o); \ dct_bfly32o(row0,row7, x0,x7,bias,shift); \ dct_bfly32o(row1,row6, x1,x6,bias,shift); \ dct_bfly32o(row2,row5, x2,x5,bias,shift); \ dct_bfly32o(row3,row4, x3,x4,bias,shift); \ } __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f)); __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f)); __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f)); __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f)); __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), stbi__f2f(-1.961570560f)); __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f)); __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f)); __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f)); /* rounding biases in column/row passes, see stbi__idct_block for explanation. */ __m128i bias_0 = _mm_set1_epi32(512); __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17)); /* load */ row0 = _mm_load_si128((const __m128i *) (data + 0*8)); row1 = _mm_load_si128((const __m128i *) (data + 1*8)); row2 = _mm_load_si128((const __m128i *) (data + 2*8)); row3 = _mm_load_si128((const __m128i *) (data + 3*8)); row4 = _mm_load_si128((const __m128i *) (data + 4*8)); row5 = _mm_load_si128((const __m128i *) (data + 5*8)); row6 = _mm_load_si128((const __m128i *) (data + 6*8)); row7 = _mm_load_si128((const __m128i *) (data + 7*8)); /* column pass */ dct_pass(bias_0, 10); { /* 16bit 8x8 transpose pass 1 */ dct_interleave16(row0, row4); dct_interleave16(row1, row5); dct_interleave16(row2, row6); dct_interleave16(row3, row7); /* transpose pass 2 */ dct_interleave16(row0, row2); dct_interleave16(row1, row3); dct_interleave16(row4, row6); dct_interleave16(row5, row7); /* transpose pass 3 */ dct_interleave16(row0, row1); dct_interleave16(row2, row3); dct_interleave16(row4, row5); dct_interleave16(row6, row7); } /* row pass */ dct_pass(bias_1, 17); { /* pack */ __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7 __m128i p1 = _mm_packus_epi16(row2, row3); __m128i p2 = _mm_packus_epi16(row4, row5); __m128i p3 = _mm_packus_epi16(row6, row7); // 8bit 8x8 transpose pass 1 dct_interleave8(p0, p2); // a0e0a1e1... dct_interleave8(p1, p3); // c0g0c1g1... // transpose pass 2 dct_interleave8(p0, p1); // a0c0e0g0... dct_interleave8(p2, p3); // b0d0f0h0... // transpose pass 3 dct_interleave8(p0, p2); // a0b0c0d0... dct_interleave8(p1, p3); // a4b4c4d4... // store _mm_storel_epi64((__m128i *) out, p0); out += out_stride; _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride; _mm_storel_epi64((__m128i *) out, p2); out += out_stride; _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride; _mm_storel_epi64((__m128i *) out, p1); out += out_stride; _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride; _mm_storel_epi64((__m128i *) out, p3); out += out_stride; _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e)); } #undef dct_const #undef dct_rot #undef dct_widen #undef dct_wadd #undef dct_wsub #undef dct_bfly32o #undef dct_interleave8 #undef dct_interleave16 #undef dct_pass } #endif /* STBI_SSE2 */ #ifdef STBI_NEON /* NEON integer IDCT. should produce bit-identical * results to the generic C version. */ static void stbi__idct_simd(uint8_t *out, int out_stride, short data[64]) { int16x8_t row0, row1, row2, row3, row4, row5, row6, row7; int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f)); int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f)); int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f)); int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f)); int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f)); int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f)); int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f)); int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f)); int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f)); int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f)); int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f)); int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f)); #define dct_long_mul(out, inq, coeff) \ int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \ int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff) #define dct_long_mac(out, acc, inq, coeff) \ int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \ int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff) #define dct_widen(out, inq) \ int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \ int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12) /* wide add */ #define dct_wadd(out, a, b) \ int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \ int32x4_t out##_h = vaddq_s32(a##_h, b##_h) /* wide sub */ #define dct_wsub(out, a, b) \ int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \ int32x4_t out##_h = vsubq_s32(a##_h, b##_h) // butterfly a/b, then shift using "shiftop" by "s" and pack #define dct_bfly32o(out0,out1, a,b,shiftop,s) \ { \ dct_wadd(sum, a, b); \ dct_wsub(dif, a, b); \ out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \ out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \ } #define dct_pass(shiftop, shift) \ { \ /* even part */ \ int16x8_t sum26 = vaddq_s16(row2, row6); \ dct_long_mul(p1e, sum26, rot0_0); \ dct_long_mac(t2e, p1e, row6, rot0_1); \ dct_long_mac(t3e, p1e, row2, rot0_2); \ int16x8_t sum04 = vaddq_s16(row0, row4); \ int16x8_t dif04 = vsubq_s16(row0, row4); \ dct_widen(t0e, sum04); \ dct_widen(t1e, dif04); \ dct_wadd(x0, t0e, t3e); \ dct_wsub(x3, t0e, t3e); \ dct_wadd(x1, t1e, t2e); \ dct_wsub(x2, t1e, t2e); \ /* odd part */ \ int16x8_t sum15 = vaddq_s16(row1, row5); \ int16x8_t sum17 = vaddq_s16(row1, row7); \ int16x8_t sum35 = vaddq_s16(row3, row5); \ int16x8_t sum37 = vaddq_s16(row3, row7); \ int16x8_t sumodd = vaddq_s16(sum17, sum35); \ dct_long_mul(p5o, sumodd, rot1_0); \ dct_long_mac(p1o, p5o, sum17, rot1_1); \ dct_long_mac(p2o, p5o, sum35, rot1_2); \ dct_long_mul(p3o, sum37, rot2_0); \ dct_long_mul(p4o, sum15, rot2_1); \ dct_wadd(sump13o, p1o, p3o); \ dct_wadd(sump24o, p2o, p4o); \ dct_wadd(sump23o, p2o, p3o); \ dct_wadd(sump14o, p1o, p4o); \ dct_long_mac(x4, sump13o, row7, rot3_0); \ dct_long_mac(x5, sump24o, row5, rot3_1); \ dct_long_mac(x6, sump23o, row3, rot3_2); \ dct_long_mac(x7, sump14o, row1, rot3_3); \ dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \ dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \ dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \ dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \ } // load row0 = vld1q_s16(data + 0*8); row1 = vld1q_s16(data + 1*8); row2 = vld1q_s16(data + 2*8); row3 = vld1q_s16(data + 3*8); row4 = vld1q_s16(data + 4*8); row5 = vld1q_s16(data + 5*8); row6 = vld1q_s16(data + 6*8); row7 = vld1q_s16(data + 7*8); // add DC bias row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0)); // column pass dct_pass(vrshrn_n_s32, 10); // 16bit 8x8 transpose { // these three map to a single VTRN.16, VTRN.32, and VSWP, respectively. // whether compilers actually get this is another story, sadly. #define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; } #define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); } #define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); } // pass 1 dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6 dct_trn16(row2, row3); dct_trn16(row4, row5); dct_trn16(row6, row7); // pass 2 dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4 dct_trn32(row1, row3); dct_trn32(row4, row6); dct_trn32(row5, row7); // pass 3 dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0 dct_trn64(row1, row5); dct_trn64(row2, row6); dct_trn64(row3, row7); #undef dct_trn16 #undef dct_trn32 #undef dct_trn64 } // row pass // vrshrn_n_s32 only supports shifts up to 16, we need // 17. so do a non-rounding shift of 16 first then follow // up with a rounding shift by 1. dct_pass(vshrn_n_s32, 16); { /* pack and round */ uint8x8_t p0 = vqrshrun_n_s16(row0, 1); uint8x8_t p1 = vqrshrun_n_s16(row1, 1); uint8x8_t p2 = vqrshrun_n_s16(row2, 1); uint8x8_t p3 = vqrshrun_n_s16(row3, 1); uint8x8_t p4 = vqrshrun_n_s16(row4, 1); uint8x8_t p5 = vqrshrun_n_s16(row5, 1); uint8x8_t p6 = vqrshrun_n_s16(row6, 1); uint8x8_t p7 = vqrshrun_n_s16(row7, 1); /* again, these can translate into one instruction, but often don't. */ #define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; } #define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); } #define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); } /* sadly can't use interleaved stores here since we only write * 8 bytes to each scan line! */ /* 8x8 8-bit transpose pass 1 */ dct_trn8_8(p0, p1); dct_trn8_8(p2, p3); dct_trn8_8(p4, p5); dct_trn8_8(p6, p7); /* pass 2 */ dct_trn8_16(p0, p2); dct_trn8_16(p1, p3); dct_trn8_16(p4, p6); dct_trn8_16(p5, p7); /* pass 3 */ dct_trn8_32(p0, p4); dct_trn8_32(p1, p5); dct_trn8_32(p2, p6); dct_trn8_32(p3, p7); /* store */ vst1_u8(out, p0); out += out_stride; vst1_u8(out, p1); out += out_stride; vst1_u8(out, p2); out += out_stride; vst1_u8(out, p3); out += out_stride; vst1_u8(out, p4); out += out_stride; vst1_u8(out, p5); out += out_stride; vst1_u8(out, p6); out += out_stride; vst1_u8(out, p7); #undef dct_trn8_8 #undef dct_trn8_16 #undef dct_trn8_32 } #undef dct_long_mul #undef dct_long_mac #undef dct_widen #undef dct_wadd #undef dct_wsub #undef dct_bfly32o #undef dct_pass } #endif /* STBI_NEON */ #define STBI__MARKER_none 0xff /* if there's a pending marker from the entropy stream, return that * otherwise, fetch from the stream and get a marker. if there's no * marker, return 0xff, which is never a valid marker value */ static uint8_t stbi__get_marker(stbi__jpeg *j) { uint8_t x; if (j->marker != STBI__MARKER_none) { x = j->marker; j->marker = STBI__MARKER_none; return x; } x = stbi__get8(j->s); if (x != 0xff) return STBI__MARKER_none; while (x == 0xff) x = stbi__get8(j->s); return x; } /* in each scan, we'll have scan_n components, and the order * of the components is specified by order[] */ #define STBI__RESTART(x) ((x) >= 0xd0 && (x) <= 0xd7) /* after a restart interval, stbi__jpeg_reset the entropy decoder and * the dc prediction */ static void stbi__jpeg_reset(stbi__jpeg *j) { j->code_bits = 0; j->code_buffer = 0; j->nomore = 0; j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = 0; j->marker = STBI__MARKER_none; j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff; j->eob_run = 0; // no more than 1<<31 MCUs if no restart_interal? that's plenty safe, // since we don't even allow 1<<30 pixels } static int stbi__parse_entropy_coded_data(stbi__jpeg *z) { stbi__jpeg_reset(z); if (!z->progressive) { if (z->scan_n == 1) { int i,j; STBI_SIMD_ALIGN(short, data[64]); int n = z->order[0]; // non-interleaved data, we just need to process one block at a time, // in trivial scanline order // number of blocks to do just depends on how many actual "pixels" this // component has, independent of interleaved MCU blocking and such int w = (z->img_comp[n].x+7) >> 3; int h = (z->img_comp[n].y+7) >> 3; for (j=0; j < h; ++j) { for (i=0; i < w; ++i) { int ha = z->img_comp[n].ha; if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data); // every data block is an MCU, so countdown the restart interval if (--z->todo <= 0) { if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); // if it's NOT a restart, then just bail, so we get corrupt data // rather than no data if (!STBI__RESTART(z->marker)) return 1; stbi__jpeg_reset(z); } } } return 1; } else { // interleaved int i,j,k,x,y; STBI_SIMD_ALIGN(short, data[64]); for (j=0; j < z->img_mcu_y; ++j) { for (i=0; i < z->img_mcu_x; ++i) { // scan an interleaved mcu... process scan_n components in order for (k=0; k < z->scan_n; ++k) { int n = z->order[k]; // scan out an mcu's worth of this component; that's just determined // by the basic H and V specified for the component for (y=0; y < z->img_comp[n].v; ++y) { for (x=0; x < z->img_comp[n].h; ++x) { int x2 = (i*z->img_comp[n].h + x)*8; int y2 = (j*z->img_comp[n].v + y)*8; int ha = z->img_comp[n].ha; if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data); } } } // after all interleaved components, that's an interleaved MCU, // so now count down the restart interval if (--z->todo <= 0) { if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); if (!STBI__RESTART(z->marker)) return 1; stbi__jpeg_reset(z); } } } return 1; } } else { if (z->scan_n == 1) { int i,j; int n = z->order[0]; // non-interleaved data, we just need to process one block at a time, // in trivial scanline order // number of blocks to do just depends on how many actual "pixels" this // component has, independent of interleaved MCU blocking and such int w = (z->img_comp[n].x+7) >> 3; int h = (z->img_comp[n].y+7) >> 3; for (j=0; j < h; ++j) { for (i=0; i < w; ++i) { short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); if (z->spec_start == 0) { if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) return 0; } else { int ha = z->img_comp[n].ha; if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha])) return 0; } // every data block is an MCU, so countdown the restart interval if (--z->todo <= 0) { if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); if (!STBI__RESTART(z->marker)) return 1; stbi__jpeg_reset(z); } } } return 1; } else { // interleaved int i,j,k,x,y; for (j=0; j < z->img_mcu_y; ++j) { for (i=0; i < z->img_mcu_x; ++i) { // scan an interleaved mcu... process scan_n components in order for (k=0; k < z->scan_n; ++k) { int n = z->order[k]; // scan out an mcu's worth of this component; that's just determined // by the basic H and V specified for the component for (y=0; y < z->img_comp[n].v; ++y) { for (x=0; x < z->img_comp[n].h; ++x) { int x2 = (i*z->img_comp[n].h + x); int y2 = (j*z->img_comp[n].v + y); short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w); if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) return 0; } } } // after all interleaved components, that's an interleaved MCU, // so now count down the restart interval if (--z->todo <= 0) { if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); if (!STBI__RESTART(z->marker)) return 1; stbi__jpeg_reset(z); } } } return 1; } } } static void stbi__jpeg_dequantize(short *data, uint8_t *dequant) { int i; for (i=0; i < 64; ++i) data[i] *= dequant[i]; } static void stbi__jpeg_finish(stbi__jpeg *z) { if (z->progressive) { // dequantize and idct the data int i,j,n; for (n=0; n < z->s->img_n; ++n) { int w = (z->img_comp[n].x+7) >> 3; int h = (z->img_comp[n].y+7) >> 3; for (j=0; j < h; ++j) { for (i=0; i < w; ++i) { short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]); z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data); } } } } } static int stbi__process_marker(stbi__jpeg *z, int m) { int L; switch (m) { case STBI__MARKER_none: // no marker found return stbi__err("expected marker","Corrupt JPEG"); case 0xDD: // DRI - specify restart interval if (stbi__get16be(z->s) != 4) return stbi__err("bad DRI len","Corrupt JPEG"); z->restart_interval = stbi__get16be(z->s); return 1; case 0xDB: // DQT - define quantization table L = stbi__get16be(z->s)-2; while (L > 0) { int q = stbi__get8(z->s); int p = q >> 4; int t = q & 15,i; if (p != 0) return stbi__err("bad DQT type","Corrupt JPEG"); if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG"); for (i=0; i < 64; ++i) z->dequant[t][stbi__jpeg_dezigzag[i]] = stbi__get8(z->s); L -= 65; } return L==0; case 0xC4: // DHT - define huffman table L = stbi__get16be(z->s)-2; while (L > 0) { uint8_t *v; int sizes[16],i,n=0; int q = stbi__get8(z->s); int tc = q >> 4; int th = q & 15; if (tc > 1 || th > 3) return stbi__err("bad DHT header","Corrupt JPEG"); for (i=0; i < 16; ++i) { sizes[i] = stbi__get8(z->s); n += sizes[i]; } L -= 17; if (tc == 0) { if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0; v = z->huff_dc[th].values; } else { if (!stbi__build_huffman(z->huff_ac+th, sizes)) return 0; v = z->huff_ac[th].values; } for (i=0; i < n; ++i) v[i] = stbi__get8(z->s); if (tc != 0) stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th); L -= n; } return L==0; } // check for comment block or APP blocks if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) { stbi__skip(z->s, stbi__get16be(z->s)-2); return 1; } return 0; } // after we see SOS static int stbi__process_scan_header(stbi__jpeg *z) { int i; int Ls = stbi__get16be(z->s); z->scan_n = stbi__get8(z->s); if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n) return stbi__err("bad SOS component count","Corrupt JPEG"); if (Ls != 6+2*z->scan_n) return stbi__err("bad SOS len","Corrupt JPEG"); for (i=0; i < z->scan_n; ++i) { int id = stbi__get8(z->s), which; int q = stbi__get8(z->s); for (which = 0; which < z->s->img_n; ++which) if (z->img_comp[which].id == id) break; if (which == z->s->img_n) return 0; /* no match */ z->img_comp[which].hd = q >> 4; if (z->img_comp[which].hd > 3) return stbi__err("bad DC huff","Corrupt JPEG"); z->img_comp[which].ha = q & 15; if (z->img_comp[which].ha > 3) return stbi__err("bad AC huff","Corrupt JPEG"); z->order[i] = which; } { int aa; z->spec_start = stbi__get8(z->s); z->spec_end = stbi__get8(z->s); /* should be 63, but might be 0 */ aa = stbi__get8(z->s); z->succ_high = (aa >> 4); z->succ_low = (aa & 15); if (z->progressive) { if (z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13) return stbi__err("bad SOS", "Corrupt JPEG"); } else { if (z->spec_start != 0) return stbi__err("bad SOS","Corrupt JPEG"); if (z->succ_high != 0 || z->succ_low != 0) return stbi__err("bad SOS","Corrupt JPEG"); z->spec_end = 63; } } return 1; } static int stbi__process_frame_header(stbi__jpeg *z, int scan) { stbi__context *s = z->s; int Lf,p,i,q, h_max=1,v_max=1,c; Lf = stbi__get16be(s); if (Lf < 11) return stbi__err("bad SOF len","Corrupt JPEG"); // JPEG p = stbi__get8(s); if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline s->img_y = stbi__get16be(s); if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG s->img_x = stbi__get16be(s); if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires c = stbi__get8(s); if (c != 3 && c != 1) return stbi__err("bad component count","Corrupt JPEG"); // JFIF requires s->img_n = c; for (i=0; i < c; ++i) { z->img_comp[i].data = NULL; z->img_comp[i].linebuf = NULL; } if (Lf != 8+3*s->img_n) return stbi__err("bad SOF len","Corrupt JPEG"); for (i=0; i < s->img_n; ++i) { z->img_comp[i].id = stbi__get8(s); if (z->img_comp[i].id != i+1) // JFIF requires if (z->img_comp[i].id != i) // some version of jpegtran outputs non-JFIF-compliant files! return stbi__err("bad component ID","Corrupt JPEG"); q = stbi__get8(s); z->img_comp[i].h = (q >> 4); if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H","Corrupt JPEG"); z->img_comp[i].v = q & 15; if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V","Corrupt JPEG"); z->img_comp[i].tq = stbi__get8(s); if (z->img_comp[i].tq > 3) return stbi__err("bad TQ","Corrupt JPEG"); } if (scan != STBI__SCAN_load) return 1; if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode"); for (i=0; i < s->img_n; ++i) { if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h; if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v; } // compute interleaved mcu info z->img_h_max = h_max; z->img_v_max = v_max; z->img_mcu_w = h_max * 8; z->img_mcu_h = v_max * 8; z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w; z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h; for (i=0; i < s->img_n; ++i) { // number of effective pixels (e.g. for non-interleaved MCU) z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max; z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max; // to simplify generation, we'll allocate enough memory to decode // the bogus oversized data from using interleaved MCUs and their // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't // discard the extra data until colorspace conversion z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8; z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8; z->img_comp[i].raw_data = malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15); if (z->img_comp[i].raw_data == NULL) { for(--i; i >= 0; --i) { free(z->img_comp[i].raw_data); z->img_comp[i].data = NULL; } return stbi__err("outofmem", "Out of memory"); } // align blocks for idct using mmx/sse z->img_comp[i].data = (uint8_t*) (((size_t) z->img_comp[i].raw_data + 15) & ~15); z->img_comp[i].linebuf = NULL; if (z->progressive) { z->img_comp[i].coeff_w = (z->img_comp[i].w2 + 7) >> 3; z->img_comp[i].coeff_h = (z->img_comp[i].h2 + 7) >> 3; z->img_comp[i].raw_coeff = malloc(z->img_comp[i].coeff_w * z->img_comp[i].coeff_h * 64 * sizeof(short) + 15); z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15); } else { z->img_comp[i].coeff = 0; z->img_comp[i].raw_coeff = 0; } } return 1; } // use comparisons since in some cases we handle more than one case (e.g. SOF) #define stbi__DNL(x) ((x) == 0xdc) #define stbi__SOI(x) ((x) == 0xd8) #define stbi__EOI(x) ((x) == 0xd9) #define stbi__SOF(x) ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2) #define stbi__SOS(x) ((x) == 0xda) #define stbi__SOF_progressive(x) ((x) == 0xc2) static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan) { int m; z->marker = STBI__MARKER_none; // initialize cached marker to empty m = stbi__get_marker(z); if (!stbi__SOI(m)) return stbi__err("no SOI","Corrupt JPEG"); if (scan == STBI__SCAN_type) return 1; m = stbi__get_marker(z); while (!stbi__SOF(m)) { if (!stbi__process_marker(z,m)) return 0; m = stbi__get_marker(z); while (m == STBI__MARKER_none) { // some files have extra padding after their blocks, so ok, we'll scan if (stbi__at_eof(z->s)) return stbi__err("no SOF", "Corrupt JPEG"); m = stbi__get_marker(z); } } z->progressive = stbi__SOF_progressive(m); if (!stbi__process_frame_header(z, scan)) return 0; return 1; } // decode image to YCbCr format static int stbi__decode_jpeg_image(stbi__jpeg *j) { int m; for (m = 0; m < 4; m++) { j->img_comp[m].raw_data = NULL; j->img_comp[m].raw_coeff = NULL; } j->restart_interval = 0; if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) return 0; m = stbi__get_marker(j); while (!stbi__EOI(m)) { if (stbi__SOS(m)) { if (!stbi__process_scan_header(j)) return 0; if (!stbi__parse_entropy_coded_data(j)) return 0; if (j->marker == STBI__MARKER_none ) { // handle 0s at the end of image data from IP Kamera 9060 while (!stbi__at_eof(j->s)) { int x = stbi__get8(j->s); if (x == 255) { j->marker = stbi__get8(j->s); break; } else if (x != 0) { return stbi__err("junk before marker", "Corrupt JPEG"); } } // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0 } } else { if (!stbi__process_marker(j, m)) return 0; } m = stbi__get_marker(j); } if (j->progressive) stbi__jpeg_finish(j); return 1; } // static jfif-centered resampling (across block boundaries) typedef uint8_t *(*resample_row_func)(uint8_t *out, uint8_t *in0, uint8_t *in1, int w, int hs); #define stbi__div4(x) ((uint8_t) ((x) >> 2)) static uint8_t *resample_row_1(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs) { STBI_NOTUSED(out); STBI_NOTUSED(in_far); STBI_NOTUSED(w); STBI_NOTUSED(hs); return in_near; } static uint8_t* stbi__resample_row_v_2(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs) { // need to generate two samples vertically for every one in input int i; STBI_NOTUSED(hs); for (i=0; i < w; ++i) out[i] = stbi__div4(3*in_near[i] + in_far[i] + 2); return out; } static uint8_t* stbi__resample_row_h_2(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs) { // need to generate two samples horizontally for every one in input int i; uint8_t *input = in_near; if (w == 1) { // if only one sample, can't do any interpolation out[0] = out[1] = input[0]; return out; } out[0] = input[0]; out[1] = stbi__div4(input[0]*3 + input[1] + 2); for (i=1; i < w-1; ++i) { int n = 3*input[i]+2; out[i*2+0] = stbi__div4(n+input[i-1]); out[i*2+1] = stbi__div4(n+input[i+1]); } out[i*2+0] = stbi__div4(input[w-2]*3 + input[w-1] + 2); out[i*2+1] = input[w-1]; STBI_NOTUSED(in_far); STBI_NOTUSED(hs); return out; } #define stbi__div16(x) ((uint8_t) ((x) >> 4)) static uint8_t *stbi__resample_row_hv_2(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs) { // need to generate 2x2 samples for every one in input int i,t0,t1; if (w == 1) { out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2); return out; } t1 = 3*in_near[0] + in_far[0]; out[0] = stbi__div4(t1+2); for (i=1; i < w; ++i) { t0 = t1; t1 = 3*in_near[i]+in_far[i]; out[i*2-1] = stbi__div16(3*t0 + t1 + 8); out[i*2 ] = stbi__div16(3*t1 + t0 + 8); } out[w*2-1] = stbi__div4(t1+2); STBI_NOTUSED(hs); return out; } #if defined(STBI_SSE2) || defined(STBI_NEON) static uint8_t *stbi__resample_row_hv_2_simd(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs) { /* need to generate 2x2 samples for every one in input */ int i=0,t0,t1; if (w == 1) { out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2); return out; } t1 = 3*in_near[0] + in_far[0]; /* process groups of 8 pixels for as long as we can. * note we can't handle the last pixel in a row in this loop * because we need to handle the filter boundary conditions. */ for (; i < ((w-1) & ~7); i += 8) { #if defined(STBI_SSE2) /* load and perform the vertical filtering pass * this uses 3*x + y = 4*x + (y - x) */ __m128i zero = _mm_setzero_si128(); __m128i farb = _mm_loadl_epi64((__m128i *) (in_far + i)); __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i)); __m128i farw = _mm_unpacklo_epi8(farb, zero); __m128i nearw = _mm_unpacklo_epi8(nearb, zero); __m128i diff = _mm_sub_epi16(farw, nearw); __m128i nears = _mm_slli_epi16(nearw, 2); __m128i curr = _mm_add_epi16(nears, diff); /* current row */ /* horizontal filter works the same based on shifted vers of current * row. "prev" is current row shifted right by 1 pixel; we need to * insert the previous pixel value (from t1). * "next" is current row shifted left by 1 pixel, with first pixel * of next block of 8 pixels added in. */ __m128i prv0 = _mm_slli_si128(curr, 2); __m128i nxt0 = _mm_srli_si128(curr, 2); __m128i prev = _mm_insert_epi16(prv0, t1, 0); __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7); /* horizontal filter, polyphase implementation since it's convenient: * even pixels = 3*cur + prev = cur*4 + (prev - cur) * odd pixels = 3*cur + next = cur*4 + (next - cur) * note the shared term. */ __m128i bias = _mm_set1_epi16(8); __m128i curs = _mm_slli_epi16(curr, 2); __m128i prvd = _mm_sub_epi16(prev, curr); __m128i nxtd = _mm_sub_epi16(next, curr); __m128i curb = _mm_add_epi16(curs, bias); __m128i even = _mm_add_epi16(prvd, curb); __m128i odd = _mm_add_epi16(nxtd, curb); /* interleave even and odd pixels, then undo scaling. */ __m128i int0 = _mm_unpacklo_epi16(even, odd); __m128i int1 = _mm_unpackhi_epi16(even, odd); __m128i de0 = _mm_srli_epi16(int0, 4); __m128i de1 = _mm_srli_epi16(int1, 4); /* pack and write output */ __m128i outv = _mm_packus_epi16(de0, de1); _mm_storeu_si128((__m128i *) (out + i*2), outv); #elif defined(STBI_NEON) // load and perform the vertical filtering pass // this uses 3*x + y = 4*x + (y - x) uint8x8_t farb = vld1_u8(in_far + i); uint8x8_t nearb = vld1_u8(in_near + i); int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb)); int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2)); int16x8_t curr = vaddq_s16(nears, diff); // current row // horizontal filter works the same based on shifted vers of current // row. "prev" is current row shifted right by 1 pixel; we need to // insert the previous pixel value (from t1). // "next" is current row shifted left by 1 pixel, with first pixel // of next block of 8 pixels added in. int16x8_t prv0 = vextq_s16(curr, curr, 7); int16x8_t nxt0 = vextq_s16(curr, curr, 1); int16x8_t prev = vsetq_lane_s16(t1, prv0, 0); int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7); /* horizontal filter, polyphase implementation since it's convenient: * even pixels = 3*cur + prev = cur*4 + (prev - cur) * odd pixels = 3*cur + next = cur*4 + (next - cur) * note the shared term. */ int16x8_t curs = vshlq_n_s16(curr, 2); int16x8_t prvd = vsubq_s16(prev, curr); int16x8_t nxtd = vsubq_s16(next, curr); int16x8_t even = vaddq_s16(curs, prvd); int16x8_t odd = vaddq_s16(curs, nxtd); /* undo scaling and round, then store with even/odd phases interleaved */ uint8x8x2_t o; o.val[0] = vqrshrun_n_s16(even, 4); o.val[1] = vqrshrun_n_s16(odd, 4); vst2_u8(out + i*2, o); #endif /* "previous" value for next iteration */ t1 = 3*in_near[i+7] + in_far[i+7]; } t0 = t1; t1 = 3*in_near[i] + in_far[i]; out[i*2] = stbi__div16(3*t1 + t0 + 8); for (++i; i < w; ++i) { t0 = t1; t1 = 3*in_near[i]+in_far[i]; out[i*2-1] = stbi__div16(3*t0 + t1 + 8); out[i*2 ] = stbi__div16(3*t1 + t0 + 8); } out[w*2-1] = stbi__div4(t1+2); STBI_NOTUSED(hs); return out; } #endif static uint8_t *stbi__resample_row_generic(uint8_t *out, uint8_t *in_near, uint8_t *in_far, int w, int hs) { /* resample with nearest-neighbor */ int i,j; STBI_NOTUSED(in_far); for (i=0; i < w; ++i) for (j=0; j < hs; ++j) out[i*hs+j] = in_near[i]; return out; } #ifdef STBI_JPEG_OLD /* this is the same YCbCr-to-RGB calculation that stb_image has used * historically before the algorithm changes in 1.49 */ #define float2fixed(x) ((int) ((x) * 65536 + 0.5)) static void stbi__YCbCr_to_RGB_row(uint8_t *out, const uint8_t *y, const uint8_t *pcb, const uint8_t *pcr, int count, int step) { int i; for (i=0; i < count; ++i) { int y_fixed = (y[i] << 16) + 32768; // rounding int r,g,b; int cr = pcr[i] - 128; int cb = pcb[i] - 128; r = y_fixed + cr*float2fixed(1.40200f); g = y_fixed - cr*float2fixed(0.71414f) - cb*float2fixed(0.34414f); b = y_fixed + cb*float2fixed(1.77200f); r >>= 16; g >>= 16; b >>= 16; if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; } if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; } out[0] = (uint8_t)r; out[1] = (uint8_t)g; out[2] = (uint8_t)b; out[3] = 255; out += step; } } #else /* this is a reduced-precision calculation of YCbCr-to-RGB introduced * to make sure the code produces the same results in both SIMD and scalar */ #define float2fixed(x) (((int) ((x) * 4096.0f + 0.5f)) << 8) static void stbi__YCbCr_to_RGB_row(uint8_t *out, const uint8_t *y, const uint8_t *pcb, const uint8_t *pcr, int count, int step) { int i; for (i=0; i < count; ++i) { int y_fixed = (y[i] << 20) + (1<<19); /* rounding */ int r,g,b; int cr = pcr[i] - 128; int cb = pcb[i] - 128; r = y_fixed + cr* float2fixed(1.40200f); g = y_fixed + (cr*-float2fixed(0.71414f)) + ((cb*-float2fixed(0.34414f)) & 0xffff0000); b = y_fixed + cb* float2fixed(1.77200f); r >>= 20; g >>= 20; b >>= 20; if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; } if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; } out[0] = (uint8_t)r; out[1] = (uint8_t)g; out[2] = (uint8_t)b; out[3] = 255; out += step; } } #endif #if defined(STBI_SSE2) || defined(STBI_NEON) static void stbi__YCbCr_to_RGB_simd(uint8_t *out, const uint8_t *y, const uint8_t *pcb, const uint8_t *pcr, int count, int step) { int i = 0; #ifdef STBI_SSE2 /* step == 3 is pretty ugly on the final interleave, and i'm not convinced * it's useful in practice (you wouldn't use it for textures, for example). * so just accelerate step == 4 case. */ if (step == 4) { /* this is a fairly straightforward implementation and not super-optimized. */ __m128i signflip = _mm_set1_epi8(-0x80); __m128i cr_const0 = _mm_set1_epi16( (short) ( 1.40200f*4096.0f+0.5f)); __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f)); __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f)); __m128i cb_const1 = _mm_set1_epi16( (short) ( 1.77200f*4096.0f+0.5f)); __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128); __m128i xw = _mm_set1_epi16(255); /* alpha channel */ for (; i+7 < count; i += 8) { // load __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i)); __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i)); __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i)); __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128 __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128 // unpack to short (and left-shift cr, cb by 8) __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes); __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased); __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased); // color transform __m128i yws = _mm_srli_epi16(yw, 4); __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw); __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw); __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1); __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1); __m128i rws = _mm_add_epi16(cr0, yws); __m128i gwt = _mm_add_epi16(cb0, yws); __m128i bws = _mm_add_epi16(yws, cb1); __m128i gws = _mm_add_epi16(gwt, cr1); // descale __m128i rw = _mm_srai_epi16(rws, 4); __m128i bw = _mm_srai_epi16(bws, 4); __m128i gw = _mm_srai_epi16(gws, 4); // back to byte, set up for transpose __m128i brb = _mm_packus_epi16(rw, bw); __m128i gxb = _mm_packus_epi16(gw, xw); // transpose to interleave channels __m128i t0 = _mm_unpacklo_epi8(brb, gxb); __m128i t1 = _mm_unpackhi_epi8(brb, gxb); __m128i o0 = _mm_unpacklo_epi16(t0, t1); __m128i o1 = _mm_unpackhi_epi16(t0, t1); // store _mm_storeu_si128((__m128i *) (out + 0), o0); _mm_storeu_si128((__m128i *) (out + 16), o1); out += 32; } } #endif #ifdef STBI_NEON // in this version, step=3 support would be easy to add. but is there demand? if (step == 4) { // this is a fairly straightforward implementation and not super-optimized. uint8x8_t signflip = vdup_n_u8(0x80); int16x8_t cr_const0 = vdupq_n_s16( (short) ( 1.40200f*4096.0f+0.5f)); int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f)); int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f)); int16x8_t cb_const1 = vdupq_n_s16( (short) ( 1.77200f*4096.0f+0.5f)); for (; i+7 < count; i += 8) { // load uint8x8_t y_bytes = vld1_u8(y + i); uint8x8_t cr_bytes = vld1_u8(pcr + i); uint8x8_t cb_bytes = vld1_u8(pcb + i); int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip)); int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip)); // expand to s16 int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4)); int16x8_t crw = vshll_n_s8(cr_biased, 7); int16x8_t cbw = vshll_n_s8(cb_biased, 7); // color transform int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0); int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0); int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1); int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1); int16x8_t rws = vaddq_s16(yws, cr0); int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1); int16x8_t bws = vaddq_s16(yws, cb1); // undo scaling, round, convert to byte uint8x8x4_t o; o.val[0] = vqrshrun_n_s16(rws, 4); o.val[1] = vqrshrun_n_s16(gws, 4); o.val[2] = vqrshrun_n_s16(bws, 4); o.val[3] = vdup_n_u8(255); // store, interleaving r/g/b/a vst4_u8(out, o); out += 8*4; } } #endif for (; i < count; ++i) { int y_fixed = (y[i] << 20) + (1<<19); // rounding int r,g,b; int cr = pcr[i] - 128; int cb = pcb[i] - 128; r = y_fixed + cr* float2fixed(1.40200f); g = y_fixed + cr*-float2fixed(0.71414f) + ((cb*-float2fixed(0.34414f)) & 0xffff0000); b = y_fixed + cb* float2fixed(1.77200f); r >>= 20; g >>= 20; b >>= 20; if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; } if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; } out[0] = (uint8_t)r; out[1] = (uint8_t)g; out[2] = (uint8_t)b; out[3] = 255; out += step; } } #endif /* set up the kernels */ static void stbi__setup_jpeg(stbi__jpeg *j) { j->idct_block_kernel = stbi__idct_block; j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row; j->resample_row_hv_2_kernel = stbi__resample_row_hv_2; #ifdef STBI_SSE2 if (stbi__sse2_available()) { j->idct_block_kernel = stbi__idct_simd; #ifndef STBI_JPEG_OLD j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; #endif j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; } #endif #ifdef STBI_NEON j->idct_block_kernel = stbi__idct_simd; #ifndef STBI_JPEG_OLD j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; #endif j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; #endif } /* clean up the temporary component buffers */ static void stbi__cleanup_jpeg(stbi__jpeg *j) { int i; for (i=0; i < j->s->img_n; ++i) { if (j->img_comp[i].raw_data) { free(j->img_comp[i].raw_data); j->img_comp[i].raw_data = NULL; j->img_comp[i].data = NULL; } if (j->img_comp[i].raw_coeff) { free(j->img_comp[i].raw_coeff); j->img_comp[i].raw_coeff = 0; j->img_comp[i].coeff = 0; } if (j->img_comp[i].linebuf) { free(j->img_comp[i].linebuf); j->img_comp[i].linebuf = NULL; } } } typedef struct { resample_row_func resample; uint8_t *line0,*line1; int hs,vs; // expansion factor in each axis int w_lores; // horizontal pixels pre-expansion int ystep; // how far through vertical expansion we are int ypos; // which pre-expansion row we're on } stbi__resample; static uint8_t *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp) { int n, decode_n; z->s->img_n = 0; // make stbi__cleanup_jpeg safe // validate req_comp if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error"); // load a jpeg image from whichever source, but leave in YCbCr format if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; } // determine actual number of components to generate n = req_comp ? req_comp : z->s->img_n; if (z->s->img_n == 3 && n < 3) decode_n = 1; else decode_n = z->s->img_n; // resample and color-convert { int k; unsigned int i,j; uint8_t *output; uint8_t *coutput[4]; stbi__resample res_comp[4]; for (k=0; k < decode_n; ++k) { stbi__resample *r = &res_comp[k]; // allocate line buffer big enough for upsampling off the edges // with upsample factor of 4 z->img_comp[k].linebuf = (uint8_t *) malloc(z->s->img_x + 3); if (!z->img_comp[k].linebuf) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); } r->hs = z->img_h_max / z->img_comp[k].h; r->vs = z->img_v_max / z->img_comp[k].v; r->ystep = r->vs >> 1; r->w_lores = (z->s->img_x + r->hs-1) / r->hs; r->ypos = 0; r->line0 = r->line1 = z->img_comp[k].data; if (r->hs == 1 && r->vs == 1) r->resample = resample_row_1; else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2; else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2; else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel; else r->resample = stbi__resample_row_generic; } // can't error after this so, this is safe output = (uint8_t *) malloc(n * z->s->img_x * z->s->img_y + 1); if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); } // now go ahead and resample for (j=0; j < z->s->img_y; ++j) { uint8_t *out = output + n * z->s->img_x * j; for (k=0; k < decode_n; ++k) { stbi__resample *r = &res_comp[k]; int y_bot = r->ystep >= (r->vs >> 1); coutput[k] = r->resample(z->img_comp[k].linebuf, y_bot ? r->line1 : r->line0, y_bot ? r->line0 : r->line1, r->w_lores, r->hs); if (++r->ystep >= r->vs) { r->ystep = 0; r->line0 = r->line1; if (++r->ypos < z->img_comp[k].y) r->line1 += z->img_comp[k].w2; } } if (n >= 3) { uint8_t *y = coutput[0]; if (z->s->img_n == 3) { z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); } else for (i=0; i < z->s->img_x; ++i) { out[0] = out[1] = out[2] = y[i]; out[3] = 255; // not used if n==3 out += n; } } else { uint8_t *y = coutput[0]; if (n == 1) for (i=0; i < z->s->img_x; ++i) out[i] = y[i]; else for (i=0; i < z->s->img_x; ++i) *out++ = y[i], *out++ = 255; } } stbi__cleanup_jpeg(z); *out_x = z->s->img_x; *out_y = z->s->img_y; if (comp) *comp = z->s->img_n; // report original components, not output return output; } } static unsigned char *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp) { stbi__jpeg j; j.s = s; stbi__setup_jpeg(&j); return load_jpeg_image(&j, x,y,comp,req_comp); } static int stbi__jpeg_test(stbi__context *s) { int r; stbi__jpeg j; j.s = s; stbi__setup_jpeg(&j); r = stbi__decode_jpeg_header(&j, STBI__SCAN_type); stbi__rewind(s); return r; } #endif bool rjpeg_image_load(uint8_t *_buf, void *data, size_t size, unsigned a_shift, unsigned r_shift, unsigned g_shift, unsigned b_shift) { unsigned i; int x, y, comp; struct texture_image *out_img = (struct texture_image*)data; out_img->pixels = stbi_load_from_memory(_buf, size, &x, &y, &comp, 4); out_img->width = x; out_img->height = y; #if 0 for (i = 0; i < (x * y); i++) { uint32_t r = (_buf[i] & 0xff00ff00); uint32_t g = ((_buf[i] << 16) & 0x00ff0000); uint32_t b = ((_buf[i] >> 16) & 0xff); if (r_shift == 0 && b_shift == 16) out_img->pixels[i] = _buf[i]; else out_img->pixels[i] = r | g | b; //out_img->pixels[i] = (r << r_shift) | (g << g_shift) || (b << b_shift); } #endif return true; }