/* Copyright (C) 2010-2017 The RetroArch team * * --------------------------------------------------------------------------------------- * The following license statement only applies to this file (encoding_utf.c). * --------------------------------------------------------------------------------------- * * Permission is hereby granted, free of charge, * to any person obtaining a copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation the rights to * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include #include #include #include #include #include #if defined(_WIN32) && !defined(_XBOX) #include #elif defined(_XBOX) #include #endif static unsigned leading_ones(uint8_t c) { unsigned ones = 0; while (c & 0x80) { ones++; c <<= 1; } return ones; } /* Simple implementation. Assumes the sequence is * properly synchronized and terminated. */ size_t utf8_conv_utf32(uint32_t *out, size_t out_chars, const char *in, size_t in_size) { unsigned i; size_t ret = 0; while (in_size && out_chars) { unsigned extra, shift; uint32_t c; uint8_t first = *in++; unsigned ones = leading_ones(first); if (ones > 6 || ones == 1) /* Invalid or desync. */ break; extra = ones ? ones - 1 : ones; if (1 + extra > in_size) /* Overflow. */ break; shift = (extra - 1) * 6; c = (first & ((1 << (7 - ones)) - 1)) << (6 * extra); for (i = 0; i < extra; i++, in++, shift -= 6) c |= (*in & 0x3f) << shift; *out++ = c; in_size -= 1 + extra; out_chars--; ret++; } return ret; } bool utf16_conv_utf8(uint8_t *out, size_t *out_chars, const uint16_t *in, size_t in_size) { static uint8_t kUtf8Limits[5] = { 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; size_t out_pos = 0; size_t in_pos = 0; for (;;) { unsigned numAdds; uint32_t value; if (in_pos == in_size) { *out_chars = out_pos; return true; } value = in[in_pos++]; if (value < 0x80) { if (out) out[out_pos] = (char)value; out_pos++; continue; } if (value >= 0xD800 && value < 0xE000) { uint32_t c2; if (value >= 0xDC00 || in_pos == in_size) break; c2 = in[in_pos++]; if (c2 < 0xDC00 || c2 >= 0xE000) break; value = (((value - 0xD800) << 10) | (c2 - 0xDC00)) + 0x10000; } for (numAdds = 1; numAdds < 5; numAdds++) if (value < (((uint32_t)1) << (numAdds * 5 + 6))) break; if (out) out[out_pos] = (char)(kUtf8Limits[numAdds - 1] + (value >> (6 * numAdds))); out_pos++; do { numAdds--; if (out) out[out_pos] = (char)(0x80 + ((value >> (6 * numAdds)) & 0x3F)); out_pos++; }while (numAdds != 0); } *out_chars = out_pos; return false; } /* Acts mostly like strlcpy. * * Copies the given number of UTF-8 characters, * but at most d_len bytes. * * Always NULL terminates. * Does not copy half a character. * * Returns number of bytes. 's' is assumed valid UTF-8. * Use only if 'chars' is considerably less than 'd_len'. */ size_t utf8cpy(char *d, size_t d_len, const char *s, size_t chars) { const uint8_t *sb = (const uint8_t*)s; const uint8_t *sb_org = sb; if (!s) return 0; while (*sb && chars-- > 0) { sb++; while ((*sb & 0xC0) == 0x80) sb++; } if ((size_t)(sb - sb_org) > d_len-1 /* NUL */) { sb = sb_org + d_len-1; while ((*sb & 0xC0) == 0x80) sb--; } memcpy(d, sb_org, sb-sb_org); d[sb-sb_org] = '\0'; return sb-sb_org; } const char *utf8skip(const char *str, size_t chars) { const uint8_t *strb = (const uint8_t*)str; if (!chars) return str; do { strb++; while ((*strb & 0xC0)==0x80) strb++; chars--; } while(chars); return (const char*)strb; } size_t utf8len(const char *string) { size_t ret = 0; if (!string) return 0; while (*string) { if ((*string & 0xC0) != 0x80) ret++; string++; } return ret; } static uint8_t utf8_walkbyte(const char **string) { return *((*string)++); } /* Does not validate the input, returns garbage if it's not UTF-8. */ uint32_t utf8_walk(const char **string) { uint8_t first = utf8_walkbyte(string); uint32_t ret = 0; if (first < 128) return first; ret = (ret << 6) | (utf8_walkbyte(string) & 0x3F); if (first >= 0xE0) ret = (ret << 6) | (utf8_walkbyte(string) & 0x3F); if (first >= 0xF0) ret = (ret << 6) | (utf8_walkbyte(string) & 0x3F); if (first >= 0xF0) return ret | (first & 7) << 18; if (first >= 0xE0) return ret | (first & 15) << 12; return ret | (first & 31) << 6; } static bool utf16_to_char(uint8_t **utf_data, size_t *dest_len, const uint16_t *in) { unsigned len = 0; while (in[len] != '\0') len++; utf16_conv_utf8(NULL, dest_len, in, len); *dest_len += 1; *utf_data = (uint8_t*)malloc(*dest_len); if (*utf_data == 0) return false; return utf16_conv_utf8(*utf_data, dest_len, in, len); } bool utf16_to_char_string(const uint16_t *in, char *s, size_t len) { size_t dest_len = 0; uint8_t *utf16_data = NULL; bool ret = utf16_to_char(&utf16_data, &dest_len, in); if (ret) { utf16_data[dest_len] = 0; strlcpy(s, (const char*)utf16_data, len); } free(utf16_data); utf16_data = NULL; return ret; } /* Returned pointer MUST be freed by the caller if non-NULL. */ static char* mb_to_mb_string_alloc(const char *str, enum CodePage cp_in, enum CodePage cp_out) { char *path_buf = NULL; wchar_t *path_buf_wide = NULL; int path_buf_len = 0; int path_buf_wide_len = 0; if (!str || !*str) return NULL; (void)path_buf; (void)path_buf_wide; (void)path_buf_len; (void)path_buf_wide_len; #if !defined(_WIN32) || defined(_XBOX) /* assume string needs no modification if not on Windows */ return strdup(str); #else #ifdef UNICODE /* TODO/FIXME: Not implemented. */ return strdup(str); #else /* Windows 95 will return 0 from these functions with a UTF8 codepage set without MSLU. From an unknown MSDN version (others omit this info): * - CP_UTF8 Windows 98/Me, Windows NT 4.0 and later: Translate using UTF-8. When this is set, dwFlags must be zero. * - Windows 95: Under the Microsoft Layer for Unicode, MultiByteToWideChar also supports CP_UTF7 and CP_UTF8. */ path_buf_wide_len = MultiByteToWideChar(cp_in, 0, str, -1, NULL, 0); if (path_buf_wide_len) { path_buf_wide = (wchar_t*) calloc(path_buf_wide_len + sizeof(wchar_t), sizeof(wchar_t)); if (path_buf_wide) { MultiByteToWideChar(cp_in, 0, str, -1, path_buf_wide, path_buf_wide_len); if (*path_buf_wide) { path_buf_len = WideCharToMultiByte(cp_out, 0, path_buf_wide, -1, NULL, 0, NULL, NULL); if (path_buf_len) { path_buf = (char*) calloc(path_buf_len + sizeof(char), sizeof(char)); if (path_buf) { WideCharToMultiByte(cp_out, 0, path_buf_wide, -1, path_buf, path_buf_len, NULL, NULL); free(path_buf_wide); if (*path_buf) return path_buf; free(path_buf); return NULL; } } else { free(path_buf_wide); return strdup(str); } } } } else return strdup(str); if (path_buf_wide) free(path_buf_wide); return NULL; #endif #endif } /* Returned pointer MUST be freed by the caller if non-NULL. */ char* utf8_to_local_string_alloc(const char *str) { return mb_to_mb_string_alloc(str, CODEPAGE_UTF8, CODEPAGE_LOCAL); } /* Returned pointer MUST be freed by the caller if non-NULL. */ char* local_to_utf8_string_alloc(const char *str) { return mb_to_mb_string_alloc(str, CODEPAGE_LOCAL, CODEPAGE_UTF8); } /* Returned pointer MUST be freed by the caller if non-NULL. */ wchar_t* utf8_to_utf16_string_alloc(const char *str) { #ifdef _WIN32 int len = 0; int out_len = 0; #else size_t len = 0; size_t out_len = 0; #endif wchar_t *buf = NULL; if (!str || !*str) return NULL; #ifdef _WIN32 len = MultiByteToWideChar(CP_UTF8, 0, str, -1, NULL, 0); if (len) { buf = (wchar_t*)calloc(len, sizeof(wchar_t)); if (!buf) return NULL; out_len = MultiByteToWideChar(CP_UTF8, 0, str, -1, buf, len); } else { /* fallback to ANSI codepage instead */ len = MultiByteToWideChar(CP_ACP, 0, str, -1, NULL, 0); if (len) { buf = (wchar_t*)calloc(len, sizeof(wchar_t)); if (!buf) return NULL; out_len = MultiByteToWideChar(CP_ACP, 0, str, -1, buf, len); } } if (out_len < 0) { free(buf); return NULL; } #else /* NOTE: For now, assume non-Windows platforms' locale is already UTF-8. */ len = mbstowcs(NULL, str, 0) + 1; if (len) { buf = (wchar_t*)calloc(len, sizeof(wchar_t)); if (!buf) return NULL; out_len = mbstowcs(buf, str, len); } if (out_len == (size_t)-1) { free(buf); return NULL; } #endif return buf; } /* Returned pointer MUST be freed by the caller if non-NULL. */ char* utf16_to_utf8_string_alloc(const wchar_t *str) { #ifdef _WIN32 int len = 0; int out_len = 0; #else size_t len = 0; size_t out_len = 0; #endif char *buf = NULL; if (!str || !*str) return NULL; #ifdef _WIN32 len = WideCharToMultiByte(CP_UTF8, 0, str, -1, NULL, 0, NULL, NULL); if (len) { buf = (char*)calloc(len, sizeof(char)); if (!buf) return NULL; out_len = WideCharToMultiByte(CP_UTF8, 0, str, -1, buf, len, NULL, NULL); } else { /* fallback to ANSI codepage instead */ len = WideCharToMultiByte(CP_ACP, 0, str, -1, NULL, 0, NULL, NULL); if (len) { buf = (char*)calloc(len, sizeof(char)); if (!buf) return NULL; out_len = WideCharToMultiByte(CP_ACP, 0, str, -1, buf, len, NULL, NULL); } } if (out_len < 0) { free(buf); return NULL; } #else /* NOTE: For now, assume non-Windows platforms' locale is already UTF-8. */ len = wcstombs(NULL, str, 0) + 1; if (len) { buf = (char*)calloc(len, sizeof(char)); if (!buf) return NULL; out_len = wcstombs(buf, str, len); } if (out_len == (size_t)-1) { free(buf); return NULL; } #endif return buf; }