cellL10n: Implement SJIS<->Unicode conversions.

This commit is contained in:
Andrew Church 2015-12-22 03:17:57 +09:00
parent 6811195345
commit 3055de0444

View File

@ -6,8 +6,11 @@
#include <windows.h>
#endif
#ifndef _MSC_VER
#ifdef _MSC_VER
typedef int HostCode;
#else
#include <iconv.h>
typedef const char *HostCode;
#endif
#include "cellL10n.h"
@ -16,73 +19,9 @@ extern Module<> cellL10n;
// Translate a code id to a code name. Some code pages may have another name.
// If this makes your compilation fail, try replacing the string code with one listed by "iconv -l".
// Look up the converter name that corresponds to an L10N code id.
//
// code:    L10N code id; ids below zero or at/above _L10N_CODE_ are rejected.
// retCode: receives the converter name on success; it is not modified on failure.
// Returns true when the id has an entry in the table below, false otherwise.
bool _L10nCodeParse(s32 code, std::string& retCode)
{
    if ((code < 0) || (code >= _L10N_CODE_))
    {
        return false;
    }

    // Stays null when the id has no entry.
    const char* name = nullptr;

    switch (code)
    {
    // The byte order (LE or BE) of these Unicode variants is not known here.
    case L10N_UTF8:          name = "UTF-8";       break;
    case L10N_UTF16:         name = "UTF-16";      break;
    case L10N_UTF32:         name = "UTF-32";      break;
    case L10N_UCS2:          name = "UCS-2";       break;
    case L10N_UCS4:          name = "UCS-4";       break;
    case L10N_ISO_8859_1:    name = "ISO-8859-1";  break;
    case L10N_ISO_8859_2:    name = "ISO-8859-2";  break;
    case L10N_ISO_8859_3:    name = "ISO-8859-3";  break;
    case L10N_ISO_8859_4:    name = "ISO-8859-4";  break;
    case L10N_ISO_8859_5:    name = "ISO-8859-5";  break;
    case L10N_ISO_8859_6:    name = "ISO-8859-6";  break;
    case L10N_ISO_8859_7:    name = "ISO-8859-7";  break;
    case L10N_ISO_8859_8:    name = "ISO-8859-8";  break;
    case L10N_ISO_8859_9:    name = "ISO-8859-9";  break;
    case L10N_ISO_8859_10:   name = "ISO-8859-10"; break;
    case L10N_ISO_8859_11:   name = "ISO-8859-11"; break;
    case L10N_ISO_8859_13:   name = "ISO-8859-13"; break; // The table has no ISO-8859-12 entry.
    case L10N_ISO_8859_14:   name = "ISO-8859-14"; break;
    case L10N_ISO_8859_15:   name = "ISO-8859-15"; break;
    case L10N_ISO_8859_16:   name = "ISO-8859-16"; break;
    case L10N_CODEPAGE_437:  name = "CP437";       break;
    case L10N_CODEPAGE_850:  name = "CP850";       break;
    case L10N_CODEPAGE_863:  name = "CP863";       break;
    case L10N_CODEPAGE_866:  name = "CP866";       break;
    case L10N_CODEPAGE_932:  name = "CP932";       break;
    case L10N_CODEPAGE_936:  name = "CP936";       break;
    case L10N_CODEPAGE_949:  name = "CP949";       break;
    case L10N_CODEPAGE_950:  name = "CP950";       break;
    case L10N_CODEPAGE_1251: name = "CP1251";      break; // Cyrillic
    case L10N_CODEPAGE_1252: name = "CP1252";      break; // ANSI
    case L10N_EUC_CN:        name = "EUC-CN";      break; // GB2312
    case L10N_EUC_JP:        name = "EUC-JP";      break;
    case L10N_EUC_KR:        name = "EUC-KR";      break;
    case L10N_ISO_2022_JP:   name = "ISO-2022-JP"; break;
    case L10N_ARIB:          name = "ARABIC";      break; // TODO: unverified guess; confirm which converter ARIB needs.
    case L10N_HZ:            name = "HZ";          break;
    case L10N_GB18030:       name = "GB18030";     break;
    case L10N_RIS_506:       name = "SHIFT-JIS";   break; // MusicShiftJIS, MS_KANJI

    // The ids below are only supported with FW 3.10 and above.
    case L10N_CODEPAGE_852:  name = "CP852";       break;
    case L10N_CODEPAGE_1250: name = "CP1250";      break; // Eastern Europe
    case L10N_CODEPAGE_737:  name = "CP737";       break;
    case L10N_CODEPAGE_1253: name = "CP1253";      break; // Greek
    case L10N_CODEPAGE_857:  name = "CP857";       break;
    case L10N_CODEPAGE_1254: name = "CP1254";      break; // Turkish
    case L10N_CODEPAGE_775:  name = "CP775";       break;
    case L10N_CODEPAGE_1257: name = "CP1257";      break; // Windows Baltic
    case L10N_CODEPAGE_855:  name = "CP855";       break;
    case L10N_CODEPAGE_858:  name = "CP858";       break;
    case L10N_CODEPAGE_860:  name = "CP860";       break;
    case L10N_CODEPAGE_861:  name = "CP861";       break;
    case L10N_CODEPAGE_865:  name = "CP865";       break;
    case L10N_CODEPAGE_869:  name = "CP869";       break;
    default:                 break;
    }

    if (name == nullptr)
    {
        return false;
    }

    retCode = name;
    return true;
}
// Translate code id to code name.
// If this makes your compilation fail, try replacing the string code with one listed by "iconv -l".
bool _L10nCodeParse(s32 code, u32& retCode)
bool _L10nCodeParse(s32 code, HostCode& retCode)
{
#ifdef _MSC_VER
retCode = 0;
if ((code >= _L10N_CODE_) || (code < 0)) return false;
switch (code)
@ -144,13 +83,73 @@ bool _L10nCodeParse(s32 code, u32& retCode)
case L10N_CODEPAGE_869: retCode = 869; return true;
default: return false;
}
#else
if ((code >= _L10N_CODE_) || (code < 0)) return false;
switch (code)
{
// I don't know these Unicode Variants is LB or BE.
case L10N_UTF8: retCode = "UTF-8"; return true;
case L10N_UTF16: retCode = "UTF-16"; return true;
case L10N_UTF32: retCode = "UTF-32"; return true;
case L10N_UCS2: retCode = "UCS-2"; return true;
case L10N_UCS4: retCode = "UCS-4"; return true;
case L10N_ISO_8859_1: retCode = "ISO-8859-1"; return true;
case L10N_ISO_8859_2: retCode = "ISO-8859-2"; return true;
case L10N_ISO_8859_3: retCode = "ISO-8859-3"; return true;
case L10N_ISO_8859_4: retCode = "ISO-8859-4"; return true;
case L10N_ISO_8859_5: retCode = "ISO-8859-5"; return true;
case L10N_ISO_8859_6: retCode = "ISO-8859-6"; return true;
case L10N_ISO_8859_7: retCode = "ISO-8859-7"; return true;
case L10N_ISO_8859_8: retCode = "ISO-8859-8"; return true;
case L10N_ISO_8859_9: retCode = "ISO-8859-9"; return true;
case L10N_ISO_8859_10: retCode = "ISO-8859-10"; return true;
case L10N_ISO_8859_11: retCode = "ISO-8859-11"; return true;
case L10N_ISO_8859_13: retCode = "ISO-8859-13"; return true; // No ISO-8859-12 ha ha.
case L10N_ISO_8859_14: retCode = "ISO-8859-14"; return true;
case L10N_ISO_8859_15: retCode = "ISO-8859-15"; return true;
case L10N_ISO_8859_16: retCode = "ISO-8859-16"; return true;
case L10N_CODEPAGE_437: retCode = "CP437"; return true;
case L10N_CODEPAGE_850: retCode = "CP850"; return true;
case L10N_CODEPAGE_863: retCode = "CP863"; return true;
case L10N_CODEPAGE_866: retCode = "CP866"; return true;
case L10N_CODEPAGE_932: retCode = "CP932"; return true;
case L10N_CODEPAGE_936: retCode = "CP936"; return true;
case L10N_CODEPAGE_949: retCode = "CP949"; return true;
case L10N_CODEPAGE_950: retCode = "CP950"; return true;
case L10N_CODEPAGE_1251:retCode = "CP1251"; return true; // CYRL
case L10N_CODEPAGE_1252:retCode = "CP1252"; return true; // ANSI
case L10N_EUC_CN: retCode = "EUC-CN"; return true; // GB2312
case L10N_EUC_JP: retCode = "EUC-JP"; return true;
case L10N_EUC_KR: retCode = "EUC-KR"; return true;
case L10N_ISO_2022_JP: retCode = "ISO-2022-JP"; return true;
case L10N_ARIB: retCode = "ARABIC"; return true; // TODO: think that should be ARABIC.
case L10N_HZ: retCode = "HZ"; return true;
case L10N_GB18030: retCode = "GB18030"; return true;
case L10N_RIS_506: retCode = "SHIFT-JIS"; return true; // MusicShiftJIS, MS_KANJI
// These are only supported with FW 3.10 and above
case L10N_CODEPAGE_852: retCode = "CP852"; return true;
case L10N_CODEPAGE_1250:retCode = "CP1250"; return true; // EE
case L10N_CODEPAGE_737: retCode = "CP737"; return true;
case L10N_CODEPAGE_1253:retCode = "CP1253"; return true; // Greek
case L10N_CODEPAGE_857: retCode = "CP857"; return true;
case L10N_CODEPAGE_1254:retCode = "CP1254"; return true; // Turk
case L10N_CODEPAGE_775: retCode = "CP775"; return true;
case L10N_CODEPAGE_1257:retCode = "CP1257"; return true; // WINBALTRIM
case L10N_CODEPAGE_855: retCode = "CP855"; return true;
case L10N_CODEPAGE_858: retCode = "CP858"; return true;
case L10N_CODEPAGE_860: retCode = "CP860"; return true;
case L10N_CODEPAGE_861: retCode = "CP861"; return true;
case L10N_CODEPAGE_865: retCode = "CP865"; return true;
case L10N_CODEPAGE_869: retCode = "CP869"; return true;
default: return false;
}
#endif
}
// TODO: check and complete the transforms. Note: conversion between the different Unicode formats is still needed.
#ifdef _MSC_VER
// Use code page to transform std::string to std::wstring.
s32 _OEM2Wide(u32 oem_code, const std::string src, std::wstring& dst)
s32 _OEM2Wide(HostCode oem_code, const std::string src, std::wstring& dst)
{
//Such length returned should include the '\0' character.
s32 length = MultiByteToWideChar(oem_code, 0, src.c_str(), -1, NULL, 0);
@ -167,7 +166,7 @@ s32 _OEM2Wide(u32 oem_code, const std::string src, std::wstring& dst)
}
// Use Code page to transform std::wstring to std::string.
s32 _Wide2OEM(u32 oem_code, const std::wstring src, std::string& dst)
s32 _Wide2OEM(HostCode oem_code, const std::wstring src, std::string& dst)
{
//Such length returned should include the '\0' character.
s32 length = WideCharToMultiByte(oem_code, 0, src.c_str(), -1, NULL, 0, NULL, NULL);
@ -184,7 +183,7 @@ s32 _Wide2OEM(u32 oem_code, const std::wstring src, std::string& dst)
}
// Convert Codepage to Codepage (all char*)
std::string _OemToOem(u32 src_code, u32 dst_code, const std::string str)
std::string _OemToOem(HostCode src_code, HostCode dst_code, const std::string str)
{
std::wstring wide; std::string result;
_OEM2Wide(src_code, str, wide);
@ -192,12 +191,11 @@ std::string _OemToOem(u32 src_code, u32 dst_code, const std::string str)
return result;
}
/*
// Original piece of code. and this is for windows using with _OEM2Wide,_Wide2OEM,_OemToOem.
// The Char -> Char Execution of this function has already been tested using VS and CJK text with encoding.
s32 _L10nConvertStr(s32 src_code, const void *src, size_t * src_len, s32 dst_code, void *dst, size_t * dst_len)
#endif
s32 _ConvertStr(s32 src_code, const void *src, s32 src_len, s32 dst_code, void *dst, s32 *dst_len, bool allowIncomplete)
{
u32 srcCode = 0, dstCode = 0; //OEM code pages
HostCode srcCode = 0, dstCode = 0; //OEM code pages
bool src_page_converted = _L10nCodeParse(src_code, srcCode); //Check if code is in list.
bool dst_page_converted = _L10nCodeParse(dst_code, dstCode);
@ -205,48 +203,63 @@ s32 _L10nConvertStr(s32 src_code, const void *src, size_t * src_len, s32 dst_cod
|| ((!dst_page_converted) && (dstCode == 0)))
return ConverterUnknown;
if (strnlen_s((char*)src, *src_len) != *src_len) return SRCIllegal;
//std::string wrapped_source = (char*)Memory.VirtualToRealAddr(src.addr());
std::string wrapped_source((char*)src);
//if (wrapped_source.length != src_len.GetValue()) return SRCIllegal;
#ifdef _MSC_VER
std::string wrapped_source = std::string(static_cast<const char *>(src), src_len);
std::string target = _OemToOem(srcCode, dstCode, wrapped_source);
if (target.length() > *dst_len) return DSTExhausted;
Memory.WriteString(dst.addr(), target);
memcpy(dst, target.c_str(), target.length());
*dst_len = target.length();
return ConversionOK;
}
// This is the one used with iconv library for linux/mac. Also char->char.
// I've tested the code with console apps using codeblocks.
s32 _L10nConvertStr(s32 src_code, const void* src, size_t * src_len, s32 dst_code, void * dst, size_t * dst_len)
{
std::string srcCode, dstCode;
#else
s32 retValue = ConversionOK;
if ((_L10nCodeParse(src_code, srcCode)) && (_L10nCodeParse(dst_code, dstCode)))
iconv_t ict = iconv_open(dstCode, srcCode);
size_t srcLen = src_len, dstLen = *dst_len;
size_t ictd = iconv(ict, (char **)&src, &srcLen, (char **)&dst, &dstLen);
*dst_len -= dstLen;
if (ictd == -1)
{
iconv_t ict = iconv_open(srcCode.c_str(), dstCode.c_str());
//char *srcBuf = (char*)Memory.VirtualToRealAddr(src.addr());
//char *dstBuf = (char*)Memory.VirtualToRealAddr(dst.addr());
char *srcBuf = (char*)src, *dstBuf = (char*)dst;
size_t srcLen = *src_len, dstLen = *dst_len;
size_t ictd = iconv(ict, &srcBuf, &srcLen, &dstBuf, &dstLen);
if (ictd != *src_len)
if (errno == EILSEQ)
retValue = SRCIllegal; //Invalid multi-byte sequence
else if (errno == E2BIG)
retValue = DSTExhausted;//Not enough space
else if (errno == EINVAL)
{
if (errno == EILSEQ)
retValue = SRCIllegal; //Invalid multi-byte sequence
else if (errno == E2BIG)
retValue = DSTExhausted;//Not enough space
else if (errno == EINVAL)
if (allowIncomplete)
*dst_len = -1; // TODO: correct value?
else
retValue = SRCIllegal;
}
iconv_close(ict);
//retValue = ConversionOK;
}
else retValue = ConverterUnknown;
iconv_close(ict);
return retValue;
}*/
#endif
}
// String conversion entry point for callers that hold guest pointers.
//
// Reads the source length and the destination length through the guest pointers,
// runs _ConvertStr on the underlying host pointers with allowIncomplete = false,
// then stores the destination length reported by _ConvertStr back through dst_len.
// Returns the status produced by _ConvertStr.
s32 _L10nConvertStr(s32 src_code, vm::cptr<void> src, vm::cptr<s32> src_len, s32 dst_code, vm::ptr<void> dst, vm::ptr<s32> dst_len)
{
    // _ConvertStr updates the length through a plain s32 pointer, so use a local copy.
    s32 out_len = *dst_len;

    const s32 status = _ConvertStr(src_code, src.get_ptr(), *src_len, dst_code, dst.get_ptr(), &out_len, false);

    *dst_len = out_len;
    return status;
}
// Single-character conversion that reports the converted length through dst_len.
//
// src/src_len describe the host-side bytes of the character to convert.
// The destination length handed to _ConvertStr is 0x7FFFFFFF, so the real size
// of the buffer behind dst is not taken into account here.
// allowIncomplete is passed as true.
// Returns the status produced by _ConvertStr.
s32 _L10nConvertChar(s32 src_code, const void *src, s32 src_len, s32 dst_code, vm::ptr<void> dst, vm::ptr<s32> dst_len)
{
    s32 out_len = 0x7FFFFFFF;

    const s32 status = _ConvertStr(src_code, src, src_len, dst_code, dst.get_ptr(), &out_len, true);

    // Hand the length reported by _ConvertStr to the caller.
    *dst_len = out_len;
    return status;
}
// Single-character conversion that returns the converted length instead of a status.
//
// src/src_len describe the host-side bytes of the character to convert.
// The destination length handed to _ConvertStr is 0x7FFFFFFF, so the real size
// of the buffer behind dst is not taken into account here.
// allowIncomplete is passed as true.
//
// Returns the length reported by _ConvertStr when the conversion status is
// ConversionOK, and 0 for any other status. Previously the status was ignored,
// so an early failure (e.g. ConverterUnknown, which leaves the length untouched)
// leaked the 0x7FFFFFFF placeholder to the caller as if it were a byte count.
// NOTE(review): 0 as the "nothing converted" value is an assumption about the
// contract of the exported functions built on this helper - confirm.
s32 _L10nConvertCharNoResult(s32 src_code, const void *src, s32 src_len, s32 dst_code, vm::ptr<void> dst)
{
    s32 dstLen = 0x7FFFFFFF;

    const s32 result = _ConvertStr(src_code, src, src_len, dst_code, dst.get_ptr(), &dstLen, true);
    if (result != ConversionOK)
    {
        return 0;
    }

    return dstLen;
}
s32 UCS2toEUCJP()
{
@ -428,9 +441,10 @@ s32 ARIBstoUTF8s()
throw EXCEPTION("");
}
// Convert a Shift-JIS string to UTF-8.
//
// src/src_len: guest pointers to the source bytes and their length.
// dst/dst_len: guest pointers to the destination buffer and its length;
//              the length is updated by _L10nConvertStr.
// Returns the status produced by _L10nConvertStr.
//
// The leftover zero-argument stub header and its unconditional throw are dropped:
// they duplicated the function header and made the conversion unreachable.
s32 SJISstoUTF8s(vm::cptr<void> src, vm::cptr<s32> src_len, vm::ptr<void> dst, vm::ptr<s32> dst_len)
{
    cellL10n.Warning("SJISstoUTF8s(src=*0x%x, src_len=*0x%x, dst=*0x%x, dst_len=*0x%x)", src, src_len, dst, dst_len);
    return _L10nConvertStr(L10N_CODEPAGE_932, src, src_len, L10N_UTF8, dst, dst_len);
}
s32 sjiszen2han()
@ -533,9 +547,10 @@ s32 EUCKRtoUHC()
throw EXCEPTION("");
}
// Convert one UCS-2 code unit to Shift-JIS.
//
// ch:  the 16-bit code unit to convert.
// dst: guest pointer to the buffer that receives the converted bytes.
// Returns the value produced by _L10nConvertCharNoResult.
//
// The source encoding is L10N_UCS2: ch is a 16-bit code unit, not UTF-8 bytes,
// which is what the previous L10N_UTF8 argument told the converter to expect.
// NOTE(review): &ch exposes the code unit in host byte order; whether the host
// converter selected for L10N_UCS2 accepts that order (and whether the Windows
// code-page table has a usable entry for it) must be confirmed.
//
// The leftover zero-argument stub header and its unconditional throw are dropped:
// they duplicated the function header and made the conversion unreachable.
s32 UCS2toSJIS(u16 ch, vm::ptr<void> dst)
{
    cellL10n.Warning("UCS2toSJIS(ch=%d, dst=*0x%x)", ch, dst);
    return _L10nConvertCharNoResult(L10N_UCS2, &ch, sizeof(ch), L10N_CODEPAGE_932, dst);
}
s32 MSJISstoUTF8s()
@ -673,7 +688,7 @@ s32 UTF8toUTF32()
throw EXCEPTION("");
}
s32 jstrchk(vm::cptr<char> jstr)
s32 jstrchk(vm::cptr<void> jstr)
{
cellL10n.Warning("jstrchk(jstr=*0x%x) -> utf8", jstr);
@ -755,56 +770,10 @@ s32 UHCtoUCS2()
throw EXCEPTION("");
}
// Convert a string from the encoding identified by src_code to the one identified by dst_code.
//
// src/src_len: guest pointers to the source bytes and their length.
// dst/dst_len: guest pointers to the destination buffer and its length;
//              the length is updated by _L10nConvertStr.
// Returns the status produced by _L10nConvertStr.
//
// All conversion work is delegated to _L10nConvertStr. The superseded inline
// implementation that used to sit in this body is removed: it returned on every
// path of its #ifdef/#else, so the delegation below could never run, and its
// iconv branch opened the converter with source and destination swapped and
// compared iconv()'s return value against the source length.
// The argument label in the log message is "src" (the stale line printed "srca").
s32 L10nConvertStr(s32 src_code, vm::cptr<void> src, vm::ptr<s32> src_len, s32 dst_code, vm::ptr<void> dst, vm::ptr<s32> dst_len)
{
    cellL10n.Error("L10nConvertStr(src_code=%d, src=*0x%x, src_len=*0x%x, dst_code=%d, dst=*0x%x, dst_len=*0x%x)", src_code, src, src_len, dst_code, dst, dst_len);
    return _L10nConvertStr(src_code, src, src_len, dst_code, dst, dst_len);
}
s32 GBKstoUTF8s()
@ -887,14 +856,14 @@ s32 UTF16toUTF32()
throw EXCEPTION("");
}
// Convert a string using a packed converter descriptor.
//
// cd:          packed descriptor; the source code id is taken from the bits above
//              bit 15 (cd >> 16) and the destination code id from the low 16 bits.
// src/src_len: guest pointers to the source bytes and their length.
// dst/dst_len: guest pointers to the destination buffer and its length;
//              the length is updated by _L10nConvertStr.
// Returns the status produced by _L10nConvertStr.
//
// Only one return statement is kept. The stale call that went through
// L10nConvertStr preceded the direct call and made it unreachable.
s32 l10n_convert_str(s32 cd, vm::cptr<void> src, vm::ptr<s32> src_len, vm::ptr<void> dst, vm::ptr<s32> dst_len)
{
    cellL10n.Warning("l10n_convert_str(cd=%d, src=*0x%x, src_len=*0x%x, dst=*0x%x, dst_len=*0x%x)", cd, src, src_len, dst, dst_len);

    const s32 src_code = cd >> 16;
    const s32 dst_code = cd & 0xffff;

    return _L10nConvertStr(src_code, src, src_len, dst_code, dst, dst_len);
}
s32 EUCJPstoJISs()
@ -1058,7 +1027,7 @@ s32 UTF8toBIG5()
throw EXCEPTION("");
}
s32 UTF16stoUTF8s(vm::cptr<u16> utf16, vm::ref<u32> utf16_len, vm::ptr<u8> utf8, vm::ref<u32> utf8_len)
s32 UTF16stoUTF8s(vm::cptr<u16> utf16, vm::ref<s32> utf16_len, vm::ptr<u8> utf8, vm::ref<s32> utf8_len)
{
cellL10n.Error("UTF16stoUTF8s(utf16=*0x%x, utf16_len=*0x%x, utf8=*0x%x, utf8_len=*0x%x)", utf16, utf16_len, utf8, utf8_len);
@ -1110,9 +1079,10 @@ s32 GB18030toUTF8()
throw EXCEPTION("");
}
// Convert one UTF-8 byte to Shift-JIS.
//
// ch:      the single source byte; exactly sizeof(ch) bytes are handed to the converter.
// dst:     guest pointer to the buffer that receives the converted bytes.
// dst_len: guest pointer that receives the converted length.
// Returns the status produced by _L10nConvertChar.
//
// NOTE(review): since the input is a single byte, characters whose UTF-8 form
// is longer than one byte presumably cannot be passed through this signature - confirm.
//
// The leftover zero-argument stub header and its unconditional throw are dropped:
// they duplicated the function header and made the conversion unreachable.
s32 UTF8toSJIS(u8 ch, vm::ptr<void> dst, vm::ptr<s32> dst_len)
{
    cellL10n.Warning("UTF8toSJIS(ch=%d, dst=*0x%x, dst_len=*0x%x)", ch, dst, dst_len);
    return _L10nConvertChar(L10N_UTF8, &ch, sizeof(ch), L10N_CODEPAGE_932, dst, dst_len);
}
s32 ARIBstoUCS2s()
@ -1150,9 +1120,10 @@ s32 UTF8stoUTF16s()
throw EXCEPTION("");
}
// Convert a Shift-JIS string to UCS-2.
//
// src/src_len: guest pointers to the source bytes and their length.
// dst/dst_len: guest pointers to the destination buffer and its length;
//              the length is updated by _L10nConvertStr.
// Returns the status produced by _L10nConvertStr.
//
// The leftover zero-argument stub header and its unconditional throw are dropped:
// they duplicated the function header and made the conversion unreachable.
s32 SJISstoUCS2s(vm::cptr<void> src, vm::cptr<s32> src_len, vm::ptr<void> dst, vm::ptr<s32> dst_len)
{
    cellL10n.Warning("SJISstoUCS2s(src=*0x%x, src_len=*0x%x, dst=*0x%x, dst_len=*0x%x)", src, src_len, dst, dst_len);
    return _L10nConvertStr(L10N_CODEPAGE_932, src, src_len, L10N_UCS2, dst, dst_len);
}
s32 BIG5stoUCS2s()