mirror of
https://gitlab.com/OpenMW/openmw.git
synced 2025-01-30 12:32:36 +00:00
Added custom UTF8 converter. Removed iconv dependency.
This commit is contained in:
parent
9a5e7816eb
commit
358e1ca5a5
@ -44,6 +44,12 @@ set(NIFOGRE_HEADER
|
||||
${COMP_DIR}/nifogre/ogre_nif_loader.hpp)
|
||||
source_group(components\\nifogre FILES ${NIFOGRE} ${NIFOGRE_HEADER})
|
||||
|
||||
set(TO_UTF8
|
||||
${COMP_DIR}/to_utf8/to_utf8.cpp)
|
||||
set(TO_UTF8_HEADER
|
||||
${COMP_DIR}/to_utf8/to_utf8.hpp)
|
||||
source_group(components\\to_utf8 FILES ${TO_UTF8} ${TO_UTF8_HEADER})
|
||||
|
||||
set(ESM_STORE
|
||||
${COMP_DIR}/esm_store/store.cpp)
|
||||
set(ESM_STORE_HEADER
|
||||
@ -75,10 +81,10 @@ file(GLOB INTERPRETER ${COMP_DIR}/interpreter/*.cpp)
|
||||
file(GLOB INTERPRETER_HEADER ${COMP_DIR}/interpreter/*.hpp)
|
||||
source_group(components\\interpreter FILES ${INTERPRETER} ${INTERPRETER_HEADER})
|
||||
|
||||
set(COMPONENTS ${BSA} ${NIF} ${NIFOGRE} ${ESM_STORE} ${MISC}
|
||||
set(COMPONENTS ${BSA} ${NIF} ${NIFOGRE} ${ESM_STORE} ${MISC} ${TO_UTF8}
|
||||
${COMPILER} ${INTERPRETER} ${ESM})
|
||||
set(COMPONENTS_HEADER ${BSA_HEADER} ${NIF_HEADER} ${NIFOGRE_HEADER} ${ESM_STORE_HEADER}
|
||||
${ESM_HEADER} ${MISC_HEADER} ${COMPILER_HEADER}
|
||||
${ESM_HEADER} ${MISC_HEADER} ${COMPILER_HEADER} ${TO_UTF8_HEADER}
|
||||
${INTERPRETER_HEADER})
|
||||
|
||||
# source directory: libs
|
||||
@ -158,12 +164,10 @@ endif (WIN32)
|
||||
find_package(OGRE REQUIRED)
|
||||
find_package(Boost REQUIRED COMPONENTS system filesystem program_options thread)
|
||||
find_package(OIS REQUIRED)
|
||||
find_package(Iconv REQUIRED)
|
||||
find_package(OpenAL REQUIRED)
|
||||
include_directories("."
|
||||
${OGRE_INCLUDE_DIR} ${OGRE_INCLUDE_DIR}/Ogre
|
||||
${OIS_INCLUDE_DIR} ${Boost_INCLUDE_DIR}
|
||||
${ICONV_INCLUDE_DIR}
|
||||
${PLATFORM_INCLUDE_DIR}
|
||||
${CMAKE_HOME_DIRECTORY}/extern/caelum/include
|
||||
${CMAKE_HOME_DIRECTORY}/extern/mygui_3.0.1/MyGUIEngine/include
|
||||
|
@ -9,6 +9,7 @@ source_group(apps\\esmtool FILES ${ESMTOOL})
|
||||
add_executable(esmtool
|
||||
${ESMTOOL}
|
||||
${MISC} ${MISC_HEADER}
|
||||
${TO_UTF8}
|
||||
)
|
||||
|
||||
target_link_libraries(esmtool
|
||||
|
@ -199,7 +199,6 @@ target_link_libraries(openmw
|
||||
${Boost_LIBRARIES}
|
||||
${OPENAL_LIBRARY}
|
||||
${SOUND_INPUT_LIBRARY}
|
||||
${ICONV_LIBRARIES}
|
||||
caelum
|
||||
MyGUIEngine
|
||||
MyGUI.OgrePlatform
|
||||
|
@ -1,69 +0,0 @@
|
||||
# - Try to find Iconv
|
||||
# Once done this will define
|
||||
#
|
||||
# ICONV_FOUND - system has Iconv
|
||||
# ICONV_INCLUDE_DIR - the Iconv include directory
|
||||
# ICONV_LIBRARIES - Link these to use Iconv
|
||||
# ICONV_SECOND_ARGUMENT_IS_CONST - the second argument for iconv() is const
|
||||
#
|
||||
include(CheckCCompilerFlag)
|
||||
include(CheckCXXSourceCompiles)
|
||||
|
||||
IF (ICONV_INCLUDE_DIR AND ICONV_LIBRARIES)
|
||||
# Already in cache, be silent
|
||||
SET(ICONV_FIND_QUIETLY TRUE)
|
||||
ENDIF (ICONV_INCLUDE_DIR AND ICONV_LIBRARIES)
|
||||
|
||||
IF(WIN32)
|
||||
SET(ICONV_INCLUDE_DIR $ENV{ICONV_INCLUDE_DIR})
|
||||
SET(ICONV_LIBRARIES $ENV{ICONV_LIBRARIES})
|
||||
ENDIF(WIN32)
|
||||
|
||||
FIND_PATH(ICONV_INCLUDE_DIR iconv.h)
|
||||
|
||||
FIND_LIBRARY(ICONV_LIBRARIES NAMES iconv libiconv c)
|
||||
|
||||
IF(ICONV_INCLUDE_DIR AND ICONV_LIBRARIES)
|
||||
SET(ICONV_FOUND TRUE)
|
||||
ENDIF(ICONV_INCLUDE_DIR AND ICONV_LIBRARIES)
|
||||
|
||||
set(CMAKE_REQUIRED_INCLUDES ${ICONV_INCLUDE_DIR})
|
||||
set(CMAKE_REQUIRED_LIBRARIES ${ICONV_LIBRARIES})
|
||||
IF(ICONV_FOUND)
|
||||
check_c_compiler_flag("-Werror" ICONV_HAVE_WERROR)
|
||||
set (CMAKE_C_FLAGS_BACKUP "${CMAKE_C_FLAGS}")
|
||||
if(ICONV_HAVE_WERROR)
|
||||
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror")
|
||||
endif(ICONV_HAVE_WERROR)
|
||||
check_c_source_compiles("
|
||||
#include <iconv.h>
|
||||
int main(){
|
||||
iconv_t conv = 0;
|
||||
const char* in = 0;
|
||||
size_t ilen = 0;
|
||||
char* out = 0;
|
||||
size_t olen = 0;
|
||||
iconv(conv, &in, &ilen, &out, &olen);
|
||||
return 0;
|
||||
}
|
||||
" ICONV_SECOND_ARGUMENT_IS_CONST )
|
||||
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS_BACKUP}")
|
||||
ENDIF(ICONV_FOUND)
|
||||
set(CMAKE_REQUIRED_INCLUDES)
|
||||
set(CMAKE_REQUIRED_LIBRARIES)
|
||||
|
||||
IF(ICONV_FOUND)
|
||||
IF(NOT ICONV_FIND_QUIETLY)
|
||||
MESSAGE(STATUS "Found Iconv: ${ICONV_LIBRARIES}")
|
||||
ENDIF(NOT ICONV_FIND_QUIETLY)
|
||||
ELSE(ICONV_FOUND)
|
||||
IF(Iconv_FIND_REQUIRED)
|
||||
MESSAGE(FATAL_ERROR "Could not find Iconv")
|
||||
ENDIF(Iconv_FIND_REQUIRED)
|
||||
ENDIF(ICONV_FOUND)
|
||||
|
||||
MARK_AS_ADVANCED(
|
||||
ICONV_INCLUDE_DIR
|
||||
ICONV_LIBRARIES
|
||||
ICONV_SECOND_ARGUMENT_IS_CONST
|
||||
)
|
@ -3,22 +3,18 @@
|
||||
|
||||
#include <string>
|
||||
#include <libs/platform/stdint.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
#include <iomanip>
|
||||
#include <errno.h>
|
||||
|
||||
#ifndef __WIN32__
|
||||
#include <iconv.h>
|
||||
#endif
|
||||
#include <string.h>
|
||||
|
||||
#include <libs/mangle/stream/stream.hpp>
|
||||
#include <libs/mangle/stream/servers/file_stream.hpp>
|
||||
#include <libs/mangle/tools/str_exception.hpp>
|
||||
#include <components/misc/stringops.hpp>
|
||||
|
||||
#include <components/to_utf8/to_utf8.hpp>
|
||||
|
||||
#ifdef __APPLE__
|
||||
// need our own implementation of strnlen
|
||||
static size_t strnlen(const char *s, size_t n)
|
||||
@ -603,112 +599,17 @@ public:
|
||||
void getName(NAME &name) { getT(name); }
|
||||
void getUint(uint32_t &u) { getT(u); }
|
||||
|
||||
// Read the next size bytes and return them as a string
|
||||
// Read the next 'size' bytes and return them as a string. Converts
|
||||
// them from native encoding to UTF8 in the process.
|
||||
std::string getString(int size)
|
||||
{
|
||||
// Not very optimized, but we can fix that later
|
||||
char *ptr = new char[size];
|
||||
char *ptr = ToUTF8::getBuffer(size);
|
||||
esm->read(ptr,size);
|
||||
|
||||
// Remove any zero terminators
|
||||
for(int i=0; i<size; i++)
|
||||
if(ptr[i] == 0)
|
||||
size = i;
|
||||
|
||||
// Convert to std::string and return
|
||||
std::string res(ptr,size);
|
||||
delete[] ptr;
|
||||
return convertToUTF8(res);
|
||||
// Convert to UTF8 and return
|
||||
return ToUTF8::getUtf8(ToUTF8::WINDOWS_1252);
|
||||
}
|
||||
|
||||
// Convert a string from the encoding used by Morrowind to UTF-8
|
||||
std::string convertToUTF8 (std::string input)
|
||||
{
|
||||
#ifdef __WIN32__
|
||||
return input;
|
||||
#else
|
||||
std::string output = "";
|
||||
|
||||
//create convert description
|
||||
iconv_t cd = iconv_open ("UTF-8", "WINDOWS-1252");
|
||||
|
||||
if (cd == (iconv_t)-1) //error handling
|
||||
{
|
||||
std::string errMsg = "Creating description for UTF-8 converting failed: ";
|
||||
|
||||
switch (errno) //detailed error messages (maybe it contains too much detail :)
|
||||
{
|
||||
case EMFILE:
|
||||
errMsg += "{OPEN_MAX} files descriptors are currently open in the calling process.";
|
||||
case ENFILE:
|
||||
errMsg += "Too many files are currently open in the system.";
|
||||
case ENOMEM:
|
||||
errMsg +="Insufficient storage space is available.";
|
||||
case EINVAL:
|
||||
errMsg += "The conversion specified by fromcode and tocode is not supported by the implementation.";
|
||||
|
||||
default:
|
||||
errMsg += "Unknown Error\n";
|
||||
}
|
||||
|
||||
fail (errMsg);
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
const size_t inputSize = input.size();
|
||||
|
||||
if (inputSize) //input is not empty
|
||||
{
|
||||
//convert function doesn't accept const char *, therefore copy content into an char *
|
||||
std::vector<char> inputBuffer (input.begin(), input.end());
|
||||
char *inputBufferBegin = &inputBuffer[0];
|
||||
|
||||
size_t inputBytesLeft = inputSize; //bytes to convert
|
||||
|
||||
static const size_t outputSize = 1000;
|
||||
size_t outputBytesLeft;
|
||||
|
||||
char outputBuffer[outputSize];
|
||||
char *outputBufferBegin;
|
||||
|
||||
while (inputBytesLeft > 0)
|
||||
{
|
||||
outputBytesLeft = outputSize;
|
||||
outputBufferBegin = outputBuffer;
|
||||
|
||||
if (iconv (cd, &inputBufferBegin, &inputBytesLeft, &outputBufferBegin, &outputBytesLeft) == (size_t)-1)
|
||||
{
|
||||
switch (errno)
|
||||
{
|
||||
case E2BIG: //outputBuffer is full
|
||||
output += std::string (outputBuffer, outputSize);
|
||||
break;
|
||||
case EILSEQ:
|
||||
fail ("Iconv: Invalid multibyte sequence.\n");
|
||||
break;
|
||||
case EINVAL:
|
||||
fail ("Iconv: Incomplete multibyte sequence.\n");
|
||||
break;
|
||||
default:
|
||||
fail ("Iconv: Unknown Error\n");
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
//read only relevant bytes from outputBuffer
|
||||
output += std::string (outputBuffer, outputSize - outputBytesLeft);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
iconv_close (cd);
|
||||
|
||||
return output;
|
||||
}
|
||||
#endif
|
||||
|
||||
void skip(int bytes) { esm->seek(esm->tell()+bytes); }
|
||||
uint64_t getOffset() { return esm->tell(); }
|
||||
|
||||
|
1
components/to_utf8/.gitignore
vendored
Normal file
1
components/to_utf8/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
gen_iconv
|
5
components/to_utf8/Makefile
Normal file
5
components/to_utf8/Makefile
Normal file
@ -0,0 +1,5 @@
|
||||
# Remove a target whose recipe failed, so that a broken generator run does
# not leave a truncated tables_gen.hpp behind that looks up to date.
.DELETE_ON_ERROR:

# Regenerate the conversion tables. The generator is run through an explicit
# ./ path because the current directory is normally not on PATH.
tables_gen.hpp: gen_iconv
	./gen_iconv > tables_gen.hpp

# Build the table generator from its single source file.
gen_iconv: gen_iconv.cpp
	g++ -Wall $^ -o $@
|
86
components/to_utf8/gen_iconv.cpp
Normal file
86
components/to_utf8/gen_iconv.cpp
Normal file
@ -0,0 +1,86 @@
|
||||
// This program generates the file tables_gen.hpp
|
||||
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
using namespace std;
|
||||
|
||||
#include <iconv.h>
|
||||
#include <assert.h>
|
||||
|
||||
// Emit the indentation that starts every table row.
void tab()
{
  cout << " ";
}
|
||||
|
||||
// Print a single byte value as " 0x<value>", followed by a comma unless it
// is the last number of the table. The numeric base is whatever is currently
// set on cout.
void num(unsigned char i, bool last)
{
  const unsigned value = i;
  cout << " 0x" << value;

  if(last)
    return;

  cout << ",";
}
|
||||
|
||||
// Write one table entry (UTF8 value), 1-5 bytes
|
||||
void writeChar(char *value, int length, bool last, const std::string &comment="")
|
||||
{
|
||||
assert(length >= 1 && length <= 5);
|
||||
tab();
|
||||
num(length, false);
|
||||
for(int i=0;i<5;i++)
|
||||
num(value[i], last && i==4);
|
||||
|
||||
if(comment != "")
|
||||
cout << " // " << comment;
|
||||
|
||||
cout << endl;
|
||||
}
|
||||
|
||||
// What to write on missing characters
|
||||
void writeMissing(bool last)
|
||||
{
|
||||
// Just write a space character
|
||||
char value[5];
|
||||
value[0] = ' ';
|
||||
for(int i=1; i<5; i++)
|
||||
value[i] = 0;
|
||||
writeChar(value, 1, last, "not part of this charset");
|
||||
}
|
||||
|
||||
int write_table(const std::string &charset, const std::string &tableName)
|
||||
{
|
||||
// Write table header
|
||||
cout << "static char " << tableName << "[] =\n{\n";
|
||||
|
||||
// Open conversion system
|
||||
iconv_t cd = iconv_open ("UTF-8", charset.c_str());
|
||||
|
||||
// Convert each character from 0 to 255
|
||||
for(int i=0; i<256; i++)
|
||||
{
|
||||
bool last = (i==255);
|
||||
|
||||
char input = i;
|
||||
char *iptr = &input;
|
||||
size_t ileft = 1;
|
||||
|
||||
char output[5];
|
||||
for(int k=0; k<5; k++) output[k] = 0;
|
||||
char *optr = output;
|
||||
size_t oleft = 5;
|
||||
|
||||
size_t res = iconv(cd, &iptr, &ileft, &optr, &oleft);
|
||||
|
||||
if(res) writeMissing(last);
|
||||
else writeChar(output, 5-oleft, last);
|
||||
}
|
||||
|
||||
iconv_close (cd);
|
||||
|
||||
// Finish table
|
||||
cout << "};\n";
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
cout << hex;
|
||||
|
||||
// English
|
||||
write_table("WINDOWS-1252", "windows_1252");
|
||||
return 0;
|
||||
}
|
259
components/to_utf8/tables_gen.hpp
Normal file
259
components/to_utf8/tables_gen.hpp
Normal file
@ -0,0 +1,259 @@
|
||||
static char windows_1252[] =
|
||||
{
|
||||
0x1, 0x0, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x1, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x2, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x3, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x4, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x5, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x6, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x7, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x8, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x9, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0xa, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0xb, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0xc, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0xd, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0xe, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0xf, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x10, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x11, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x12, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x13, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x14, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x15, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x16, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x17, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x18, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x19, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x1a, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x1b, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x1c, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x1d, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x1e, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x1f, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x20, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x21, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x22, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x23, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x24, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x25, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x26, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x27, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x28, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x29, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x2a, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x2b, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x2c, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x2d, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x2e, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x2f, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x30, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x31, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x32, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x33, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x34, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x35, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x36, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x37, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x38, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x39, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x3a, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x3b, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x3c, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x3d, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x3e, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x3f, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x40, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x41, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x42, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x43, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x44, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x45, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x46, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x47, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x48, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x49, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x4a, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x4b, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x4c, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x4d, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x4e, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x4f, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x50, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x51, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x52, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x53, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x54, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x55, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x56, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x57, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x58, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x59, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x5a, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x5b, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x5c, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x5d, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x5e, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x5f, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x60, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x61, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x62, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x63, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x64, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x65, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x66, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x67, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x68, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x69, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x6a, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x6b, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x6c, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x6d, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x6e, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x6f, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x70, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x71, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x72, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x73, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x74, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x75, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x76, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x77, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x78, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x79, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x7a, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x7b, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x7c, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x7d, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x7e, 0x0, 0x0, 0x0, 0x0,
|
||||
0x1, 0x7f, 0x0, 0x0, 0x0, 0x0,
|
||||
0x3, 0xe2, 0x82, 0xac, 0x0, 0x0,
|
||||
0x1, 0x20, 0x0, 0x0, 0x0, 0x0, // not part of this charset
|
||||
0x3, 0xe2, 0x80, 0x9a, 0x0, 0x0,
|
||||
0x2, 0xc6, 0x92, 0x0, 0x0, 0x0,
|
||||
0x3, 0xe2, 0x80, 0x9e, 0x0, 0x0,
|
||||
0x3, 0xe2, 0x80, 0xa6, 0x0, 0x0,
|
||||
0x3, 0xe2, 0x80, 0xa0, 0x0, 0x0,
|
||||
0x3, 0xe2, 0x80, 0xa1, 0x0, 0x0,
|
||||
0x2, 0xcb, 0x86, 0x0, 0x0, 0x0,
|
||||
0x3, 0xe2, 0x80, 0xb0, 0x0, 0x0,
|
||||
0x2, 0xc5, 0xa0, 0x0, 0x0, 0x0,
|
||||
0x3, 0xe2, 0x80, 0xb9, 0x0, 0x0,
|
||||
0x2, 0xc5, 0x92, 0x0, 0x0, 0x0,
|
||||
0x1, 0x20, 0x0, 0x0, 0x0, 0x0, // not part of this charset
|
||||
0x2, 0xc5, 0xbd, 0x0, 0x0, 0x0,
|
||||
0x1, 0x20, 0x0, 0x0, 0x0, 0x0, // not part of this charset
|
||||
0x1, 0x20, 0x0, 0x0, 0x0, 0x0, // not part of this charset
|
||||
0x3, 0xe2, 0x80, 0x98, 0x0, 0x0,
|
||||
0x3, 0xe2, 0x80, 0x99, 0x0, 0x0,
|
||||
0x3, 0xe2, 0x80, 0x9c, 0x0, 0x0,
|
||||
0x3, 0xe2, 0x80, 0x9d, 0x0, 0x0,
|
||||
0x3, 0xe2, 0x80, 0xa2, 0x0, 0x0,
|
||||
0x3, 0xe2, 0x80, 0x93, 0x0, 0x0,
|
||||
0x3, 0xe2, 0x80, 0x94, 0x0, 0x0,
|
||||
0x2, 0xcb, 0x9c, 0x0, 0x0, 0x0,
|
||||
0x3, 0xe2, 0x84, 0xa2, 0x0, 0x0,
|
||||
0x2, 0xc5, 0xa1, 0x0, 0x0, 0x0,
|
||||
0x3, 0xe2, 0x80, 0xba, 0x0, 0x0,
|
||||
0x2, 0xc5, 0x93, 0x0, 0x0, 0x0,
|
||||
0x1, 0x20, 0x0, 0x0, 0x0, 0x0, // not part of this charset
|
||||
0x2, 0xc5, 0xbe, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc5, 0xb8, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xa0, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xa1, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xa2, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xa3, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xa4, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xa5, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xa6, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xa7, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xa8, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xa9, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xaa, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xab, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xac, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xad, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xae, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xaf, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xb0, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xb1, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xb2, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xb3, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xb4, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xb5, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xb6, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xb7, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xb8, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xb9, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xba, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xbb, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xbc, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xbd, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xbe, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc2, 0xbf, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x80, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x81, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x82, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x83, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x84, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x85, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x86, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x87, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x88, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x89, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x8a, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x8b, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x8c, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x8d, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x8e, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x8f, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x90, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x91, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x92, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x93, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x94, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x95, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x96, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x97, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x98, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x99, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x9a, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x9b, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x9c, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x9d, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x9e, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0x9f, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xa0, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xa1, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xa2, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xa3, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xa4, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xa5, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xa6, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xa7, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xa8, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xa9, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xaa, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xab, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xac, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xad, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xae, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xaf, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xb0, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xb1, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xb2, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xb3, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xb4, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xb5, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xb6, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xb7, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xb8, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xb9, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xba, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xbb, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xbc, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xbd, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xbe, 0x0, 0x0, 0x0,
|
||||
0x2, 0xc3, 0xbf, 0x0, 0x0, 0x0
|
||||
};
|
159
components/to_utf8/to_utf8.cpp
Normal file
159
components/to_utf8/to_utf8.cpp
Normal file
@ -0,0 +1,159 @@
|
||||
#include "to_utf8.hpp"
|
||||
|
||||
#include <vector>
|
||||
#include <assert.h>
|
||||
|
||||
/* This file contains the code to translate from WINDOWS-1252 (native
|
||||
charset used in English version of Morrowind) to UTF-8. The library
|
||||
is designed to be extended to support more source encodings later,
|
||||
which means that we may add support for Russian, Polish and Chinese
|
||||
files and so on.
|
||||
|
||||
The code does not depend on any external library at
|
||||
runtime. Instead, it uses a pregenerated table made with iconv (see
|
||||
gen_iconv.cpp and the Makefile) which is located in tables_gen.hpp.
|
||||
|
||||
This is both faster and uses less dependencies. The tables would
|
||||
only need to be regenerated if we are adding support for more input
|
||||
encodings. As such, there is no need to make the generator code
|
||||
platform independent.
|
||||
|
||||
The library is optimized for the case of pure ASCII input strings,
|
||||
which is the vast majority of cases at least for the English
|
||||
version. A test of my version of Morrowind.esm got 130 non-ASCII vs
|
||||
236195 ASCII strings, or less than 0.06% of strings containing
|
||||
non-ASCII characters.
|
||||
|
||||
To optimize for this, if the first pass of the string does not find
|
||||
any non-ASCII characters, the entire string is passed along without
|
||||
any modification.
|
||||
|
||||
Most of the non-ASCII strings are books, and are quite large. (The
|
||||
non-ASCII characters are typically starting and ending quotation
|
||||
marks.) Within these, almost all the characters are ASCII. For this
|
||||
purpose, the library is also optimized for mostly-ASCII contents
|
||||
even in the cases where some conversion is necessary.
|
||||
*/
|
||||
|
||||
|
||||
// Generated tables
|
||||
#include "tables_gen.hpp"
|
||||
|
||||
// Shared global buffers, we love you.
|
||||
static std::vector<char> buf;
|
||||
static std::vector<char> output;
|
||||
static int size;
|
||||
|
||||
// Make sure the given vector is large enough for 'size' bytes plus a
// terminating zero, and write that terminator at index 'size'.
static void resize(std::vector<char> &buf, size_t size)
{
  if(buf.size() <= size)
    // Add some extra padding to reduce the chance of having to resize
    // again later. The "+ 1" guarantees room for the terminator even
    // when size is 0; 3*0 alone would leave the vector empty and make
    // the write below go out of bounds.
    buf.resize(3*size + 1);

  // And make sure the string is zero terminated
  buf[size] = 0;
}
|
||||
|
||||
// This is just used to spew out a reusable input buffer for the
|
||||
// conversion process.
|
||||
char *ToUTF8::getBuffer(int s)
|
||||
{
|
||||
// Remember the requested size
|
||||
size = s;
|
||||
resize(buf, size);
|
||||
return &buf[0];
|
||||
}
|
||||
|
||||
/** Compute the number of output bytes needed to translate the
    zero-terminated string 'input' with the translation array 'arr'.
    The array is encoded with 6 bytes per input character: the first
    gives the length of the translated sequence and the next 5 hold
    the actual data.

    For optimization reasons the function also reports whether the
    input is pure ascii: 'ascii' is set to true when every input byte
    is <= 127 and to false otherwise, so the caller can skip the
    translation step for pure ascii input.
*/
static size_t getLength(const char *arr, const char* input, bool &ascii)
{
  ascii = true;
  size_t total = 0;

  for(const char *cur = input; *cur != 0; ++cur)
    {
      const unsigned char value = *cur;
      if(value >= 128)
        ascii = false;
      total += arr[value*6];
    }

  return total;
}
|
||||
|
||||
// Append the translation of the single input byte 'ch' to the output and
// move 'out' past the bytes written. Values below 128 are copied through
// unchanged without consulting 'arr'; all others are looked up in their
// 6-byte table entry (length byte followed by the data bytes).
static void copyFromArray(const char *arr, unsigned char ch, char* &out)
{
  if(ch >= 128)
    {
      const char *entry = arr + ch*6;
      const int count = entry[0];
      for(int k=1; k<=count; ++k)
        *out++ = entry[k];
      return;
    }

  // Plain ASCII: copy as-is.
  *out++ = ch;
}
|
||||
|
||||
std::string ToUTF8::getUtf8(ToUTF8::FromType from)
|
||||
{
|
||||
// Pick translation array
|
||||
const char *arr;
|
||||
if(from == ToUTF8::WINDOWS_1252)
|
||||
arr = windows_1252;
|
||||
else
|
||||
assert(0);
|
||||
|
||||
// Double check that the input string stops at some point (it might
|
||||
// contain zero terminators before this, inside its own data, which
|
||||
// is also ok.)
|
||||
const char* input = &buf[0];
|
||||
assert(input[size] == 0);
|
||||
|
||||
// TODO: The rest of this function is designed for single-character
|
||||
// input encodings only. It also assumes that the input the input
|
||||
// encoding shares its first 128 values (0-127) with ASCII. These
|
||||
// conditions must be checked again if you add more input encodings
|
||||
// later.
|
||||
|
||||
// Compute output length, and check for pure ascii input at the same
|
||||
// time.
|
||||
bool ascii;
|
||||
size_t outlen = getLength(arr, input, ascii);
|
||||
|
||||
// If we're pure ascii, then don't bother converting anything.
|
||||
if(ascii)
|
||||
return std::string(input, outlen);
|
||||
|
||||
// Make sure the output is large enough
|
||||
resize(output, outlen);
|
||||
char *out = &output[0];
|
||||
|
||||
// Translate
|
||||
while(*input)
|
||||
copyFromArray(arr, *(input++), out);
|
||||
|
||||
// Make sure that we wrote the correct number of bytes
|
||||
assert((out-&output[0]) == (int)outlen);
|
||||
|
||||
// And make extra sure the output is null terminated
|
||||
assert(output.size() > outlen);
|
||||
assert(output[outlen] == 0);
|
||||
|
||||
// Return a string
|
||||
return std::string(&output[0], outlen);
|
||||
}
|
||||
|
24
components/to_utf8/to_utf8.hpp
Normal file
24
components/to_utf8/to_utf8.hpp
Normal file
@ -0,0 +1,24 @@
|
||||
#ifndef COMPONENTS_TOUTF8_H
|
||||
#define COMPONENTS_TOUTF8_H
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace ToUTF8
|
||||
{
|
||||
// These are all the currently supported code pages
|
||||
enum FromType
|
||||
{
|
||||
WINDOWS_1252 // Used by English version of Morrowind (and
|
||||
// probably others)
|
||||
};
|
||||
|
||||
// Return a writable buffer of at least 'size' bytes. The buffer
|
||||
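// does not have to be freed. The same shared buffer is handed out on
// every call, so a new call reuses the storage of the previous one.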
|
||||
char* getBuffer(int size);
|
||||
|
||||
// Convert the previously written buffer to UTF8 from the given code
|
||||
// page.
|
||||
std::string getUtf8(FromType from);
|
||||
}
|
||||
|
||||
#endif
|
Loading…
x
Reference in New Issue
Block a user