From 358b7ad3ec7d27f4a8291adde7f19eac904d3ddc Mon Sep 17 00:00:00 2001 From: elsid Date: Mon, 22 May 2023 18:31:08 +0200 Subject: [PATCH 1/3] Keep ESM4 localized strings in memory Size of the files is in order of megabytes at max. Storing offset lookup table to read from file on demand is less efficient than reading from memory for such size. Read and store offsets first. Sort them to read values sequentially. Memoize last offset and value to avoid reading the same value twice. Use seek only when current position does not match offset. Optimize seek for short distance by calling read instead. --- apps/esmtool/tes4.cpp | 3 +- apps/openmw/mwworld/esmloader.cpp | 5 +- components/esm4/reader.cpp | 221 ++++++++++++++++++------------ components/esm4/reader.hpp | 47 +++---- 4 files changed, 155 insertions(+), 121 deletions(-) diff --git a/apps/esmtool/tes4.cpp b/apps/esmtool/tes4.cpp index 904040428d..8760e33dfb 100644 --- a/apps/esmtool/tes4.cpp +++ b/apps/esmtool/tes4.cpp @@ -526,8 +526,7 @@ namespace EsmTool try { const ToUTF8::StatelessUtf8Encoder encoder(ToUTF8::calculateEncoding(info.encoding)); - ESM4::Reader reader(std::move(stream), info.filename); - reader.setEncoder(&encoder); + ESM4::Reader reader(std::move(stream), info.filename, nullptr, &encoder); const Params params(info); if (!params.mQuite) diff --git a/apps/openmw/mwworld/esmloader.cpp b/apps/openmw/mwworld/esmloader.cpp index 9190d53ea5..e586a4c204 100644 --- a/apps/openmw/mwworld/esmloader.cpp +++ b/apps/openmw/mwworld/esmloader.cpp @@ -64,9 +64,8 @@ namespace MWWorld } case ESM::Format::Tes4: { - ESM4::Reader readerESM4( - std::move(stream), filepath, MWBase::Environment::get().getResourceSystem()->getVFS()); - readerESM4.setEncoder(mReaders.getStatelessEncoder()); + ESM4::Reader readerESM4(std::move(stream), filepath, + MWBase::Environment::get().getResourceSystem()->getVFS(), mReaders.getStatelessEncoder()); readerESM4.setModIndex(index); readerESM4.updateModIndices(mNameToIndex); 
mStore.loadESM4(readerESM4); diff --git a/components/esm4/reader.cpp b/components/esm4/reader.cpp index ff578b23a4..8ce5b1f4d2 100644 --- a/components/esm4/reader.cpp +++ b/components/esm4/reader.cpp @@ -46,6 +46,7 @@ #include #include +#include #include #include #include @@ -56,6 +57,23 @@ namespace ESM4 { + namespace + { + std::u8string_view getStringsSuffix(LocalizedStringType type) + { + switch (type) + { + case LocalizedStringType::Strings: + return u8"_English.STRINGS"; + case LocalizedStringType::ILStrings: + return u8"_English.ILSTRINGS"; + case LocalizedStringType::DLStrings: + return u8"_English.DLSTRINGS"; + } + + throw std::logic_error("Unsupported LocalizedStringType: " + std::to_string(static_cast(type))); + } + } ReaderContext::ReaderContext() : modIndex(0) @@ -72,9 +90,10 @@ namespace ESM4 subRecordHeader.dataSize = 0; } - Reader::Reader(Files::IStreamPtr&& esmStream, const std::filesystem::path& filename, VFS::Manager const* vfs) + Reader::Reader(Files::IStreamPtr&& esmStream, const std::filesystem::path& filename, VFS::Manager const* vfs, + const ToUTF8::StatelessUtf8Encoder* encoder) : mVFS(vfs) - , mEncoder(nullptr) + , mEncoder(encoder) , mFileSize(0) , mStream(std::move(esmStream)) { @@ -209,58 +228,120 @@ namespace ESM4 if ((mHeader.mFlags & Rec_ESM) == 0 || (mHeader.mFlags & Rec_Localized) == 0) return; - const auto filename = mCtx.filename.stem().filename().u8string(); + const std::u8string prefix = mCtx.filename.stem().filename().u8string(); - static const std::filesystem::path s("Strings"); - buildLStringIndex(s / (filename + u8"_English.STRINGS"), Type_Strings); - buildLStringIndex(s / (filename + u8"_English.ILSTRINGS"), Type_ILStrings); - buildLStringIndex(s / (filename + u8"_English.DLSTRINGS"), Type_DLStrings); + buildLStringIndex(LocalizedStringType::Strings, prefix); + buildLStringIndex(LocalizedStringType::ILStrings, prefix); + buildLStringIndex(LocalizedStringType::DLStrings, prefix); } - void Reader::buildLStringIndex(const 
std::filesystem::path& stringFile, LocalizedStringType stringType) + void Reader::buildLStringIndex(LocalizedStringType stringType, const std::u8string& prefix) { - std::uint32_t numEntries; - std::uint32_t dataSize; - std::uint32_t stringId; - LStringOffset sp; - sp.type = stringType; + static const std::filesystem::path strings("Strings"); + const std::u8string suffix(getStringsSuffix(stringType)); + std::filesystem::path path = strings / (prefix + suffix); - // TODO: possibly check if the resource exists? - Files::IStreamPtr filestream = mVFS - ? mVFS->get(stringFile.string()) - : Files::openConstrainedFileStream(mCtx.filename.parent_path() / stringFile); - - filestream->seekg(0, std::ios::end); - std::size_t fileSize = filestream->tellg(); - filestream->seekg(0, std::ios::beg); - - std::istream* stream = filestream.get(); - switch (stringType) + if (mVFS != nullptr) { - case Type_Strings: - mStrings = std::move(filestream); - break; - case Type_ILStrings: - mILStrings = std::move(filestream); - break; - case Type_DLStrings: - mDLStrings = std::move(filestream); - break; - default: - throw std::runtime_error("ESM4::Reader::unknown localised string type"); + const Files::IStreamPtr stream = mVFS->get(Files::pathToUnicodeString(path)); + buildLStringIndex(stringType, *stream); + return; } - stream->read((char*)&numEntries, sizeof(numEntries)); - stream->read((char*)&dataSize, sizeof(dataSize)); - std::size_t dataStart = fileSize - dataSize; - for (unsigned int i = 0; i < numEntries; ++i) + const Files::IStreamPtr stream = Files::openConstrainedFileStream(mCtx.filename.parent_path() / path); + buildLStringIndex(stringType, *stream); + } + + void Reader::buildLStringIndex(LocalizedStringType stringType, std::istream& stream) + { + stream.seekg(0, std::ios::end); + const std::istream::pos_type fileSize = stream.tellg(); + stream.seekg(0, std::ios::beg); + + std::uint32_t numEntries = 0; + stream.read(reinterpret_cast(&numEntries), sizeof(numEntries)); + + 
std::uint32_t dataSize = 0; + stream.read(reinterpret_cast(&dataSize), sizeof(dataSize)); + + const std::istream::pos_type dataStart = fileSize - static_cast(dataSize); + + struct LocalizedString { - stream->read((char*)&stringId, sizeof(stringId)); - stream->read((char*)&sp.offset, sizeof(sp.offset)); - sp.offset += (std::uint32_t)dataStart; - mLStringIndex[FormId::fromUint32(stringId)] = sp; + std::uint32_t mOffset = 0; + std::uint32_t mStringId = 0; + }; + + std::vector strings; + strings.reserve(numEntries); + + for (std::uint32_t i = 0; i < numEntries; ++i) + { + LocalizedString string; + + stream.read(reinterpret_cast(&string.mStringId), sizeof(string.mStringId)); + stream.read(reinterpret_cast(&string.mOffset), sizeof(string.mOffset)); + + strings.push_back(string); } - // assert (dataStart - stream->tell() == 0 && "String file start of data section mismatch"); + + std::sort(strings.begin(), strings.end(), + [](const LocalizedString& l, const LocalizedString& r) { return l.mOffset < r.mOffset; }); + + std::uint32_t lastOffset = 0; + std::string_view lastValue; + + for (const LocalizedString& string : strings) + { + if (string.mOffset == lastOffset) + { + mLStringIndex.emplace(FormId::fromUint32(string.mStringId), lastValue); + continue; + } + + const std::istream::pos_type offset = string.mOffset + dataStart; + const std::istream::pos_type pos = stream.tellg(); + if (pos != offset) + { + char buffer[4096]; + if (pos < offset && offset - pos < static_cast(sizeof(buffer))) + stream.read(buffer, offset - pos); + else + stream.seekg(offset); + } + + const auto it + = mLStringIndex.emplace(FormId::fromUint32(string.mStringId), readLocalizedString(stringType, stream)) + .first; + lastOffset = string.mOffset; + lastValue = it->second; + } + } + + std::string Reader::readLocalizedString(LocalizedStringType type, std::istream& stream) + { + if (type == LocalizedStringType::Strings) + { + std::string data; + + while (true) + { + char ch = 0; + stream.read(&ch, 
sizeof(ch)); + if (ch == 0) + break; + data.push_back(ch); + } + + return data; + } + + std::uint32_t size = 0; + stream.read(reinterpret_cast(&size), sizeof(size)); + + std::string result; + getStringImpl(result, size, stream, true); // expect null terminated string + return result; } void Reader::getLocalizedString(std::string& str) @@ -277,48 +358,13 @@ namespace ESM4 // FIXME: very messy and probably slow/inefficient void Reader::getLocalizedStringImpl(const FormId stringId, std::string& str) { - const std::map::const_iterator it = mLStringIndex.find(stringId); + const auto it = mLStringIndex.find(stringId); - if (it != mLStringIndex.end()) - { - std::istream* filestream = nullptr; + if (it == mLStringIndex.end()) + throw std::runtime_error( + "ESM4::Reader::getLocalizedString localized string not found for " + formIdToString(stringId)); - switch (it->second.type) - { - case Type_Strings: // no string size provided - { - filestream = mStrings.get(); - filestream->seekg(it->second.offset); - - char ch; - std::vector data; - do - { - filestream->read(&ch, sizeof(ch)); - data.push_back(ch); - } while (ch != 0); - - str = std::string(data.data()); - return; - } - case Type_ILStrings: - filestream = mILStrings.get(); - break; - case Type_DLStrings: - filestream = mDLStrings.get(); - break; - default: - throw std::runtime_error("ESM4::Reader::getLocalizedString unknown string type"); - } - - // get ILStrings or DLStrings (they provide string size) - filestream->seekg(it->second.offset); - std::uint32_t size = 0; - filestream->read((char*)&size, sizeof(size)); - getStringImpl(str, size, *filestream, mEncoder, true); // expect null terminated string - } - else - throw std::runtime_error("ESM4::Reader::getLocalizedString localized string not found"); + str = it->second; } bool Reader::getRecordHeader() @@ -659,19 +705,18 @@ namespace ESM4 throw std::runtime_error(ss.str()); } - bool Reader::getStringImpl(std::string& str, std::size_t size, std::istream& stream, - const 
ToUTF8::StatelessUtf8Encoder* encoder, bool hasNull) + bool Reader::getStringImpl(std::string& str, std::size_t size, std::istream& stream, bool hasNull) { std::size_t newSize = size; - if (encoder) + if (mEncoder != nullptr) { std::string input(size, '\0'); stream.read(input.data(), size); if (stream.gcount() == static_cast(size)) { const std::string_view result - = encoder->getUtf8(input, ToUTF8::BufferAllocationPolicy::FitToRequiredSize, str); + = mEncoder->getUtf8(input, ToUTF8::BufferAllocationPolicy::FitToRequiredSize, str); if (str.empty() && !result.empty()) { str = std::move(input); diff --git a/components/esm4/reader.hpp b/components/esm4/reader.hpp index 86a52115e4..2f6dd6bfdf 100644 --- a/components/esm4/reader.hpp +++ b/components/esm4/reader.hpp @@ -131,6 +131,13 @@ namespace ESM4 ReaderContext(); }; + enum class LocalizedStringType + { + Strings, + ILStrings, + DLStrings, + }; + class Reader { VFS::Manager const* mVFS; @@ -149,24 +156,15 @@ namespace ESM4 Files::IStreamPtr mILStrings; Files::IStreamPtr mDLStrings; - enum LocalizedStringType - { - Type_Strings = 0, - Type_ILStrings = 1, - Type_DLStrings = 2 - }; - - struct LStringOffset - { - LocalizedStringType type; - std::uint32_t offset; - }; - - std::map mLStringIndex; + std::unordered_map mLStringIndex; std::vector* mGlobalReaderList = nullptr; - void buildLStringIndex(const std::filesystem::path& stringFile, LocalizedStringType stringType); + void buildLStringIndex(LocalizedStringType stringType, const std::u8string& prefix); + + void buildLStringIndex(LocalizedStringType stringType, std::istream& stream); + + std::string readLocalizedString(LocalizedStringType type, std::istream& stream); inline bool hasLocalizedStrings() const { return (mHeader.mFlags & Rec_Localized) != 0; } @@ -185,11 +183,12 @@ namespace ESM4 Reader() = default; - bool getStringImpl(std::string& str, std::size_t size, std::istream& stream, - const ToUTF8::StatelessUtf8Encoder* encoder, bool hasNull = false); + bool 
getStringImpl(std::string& str, std::size_t size, std::istream& stream, bool hasNull = false); public: - Reader(Files::IStreamPtr&& esmStream, const std::filesystem::path& filename, VFS::Manager const* vfs = nullptr); + Reader(Files::IStreamPtr&& esmStream, const std::filesystem::path& filename, VFS::Manager const* vfs, + const ToUTF8::StatelessUtf8Encoder* encoder); + ~Reader(); void open(const std::filesystem::path& filename); @@ -198,8 +197,6 @@ namespace ESM4 inline bool isEsm4() const { return true; } - inline void setEncoder(const ToUTF8::StatelessUtf8Encoder* encoder) { mEncoder = encoder; } - const std::vector& getGameFiles() const { return mHeader.mMaster; } inline int getRecordCount() const { return mHeader.mData.records; } @@ -348,14 +345,8 @@ namespace ESM4 void adjustGRUPFormId(); // Note: uses the string size from the subrecord header rather than checking null termination - bool getZString(std::string& str) - { - return getStringImpl(str, mCtx.subRecordHeader.dataSize, *mStream, mEncoder, true); - } - bool getString(std::string& str) - { - return getStringImpl(str, mCtx.subRecordHeader.dataSize, *mStream, mEncoder); - } + bool getZString(std::string& str) { return getStringImpl(str, mCtx.subRecordHeader.dataSize, *mStream, true); } + bool getString(std::string& str) { return getStringImpl(str, mCtx.subRecordHeader.dataSize, *mStream); } bool getZeroTerminatedStringArray(std::vector& values); From e537d1d0d43ef5b824490425db86977c1ea94257 Mon Sep 17 00:00:00 2001 From: elsid Date: Mon, 22 May 2023 20:13:26 +0200 Subject: [PATCH 2/3] Ignore missing localization string files by esmtool Dawnguard.esm from Skyrim depends on files stored in Dawnguard.bsa which is not processed by esmtool. 
--- apps/esmtool/tes4.cpp | 2 +- components/esm4/reader.cpp | 27 ++++++++++++++++++++++++--- components/esm4/reader.hpp | 4 +++- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/apps/esmtool/tes4.cpp b/apps/esmtool/tes4.cpp index 8760e33dfb..8eaf1b3464 100644 --- a/apps/esmtool/tes4.cpp +++ b/apps/esmtool/tes4.cpp @@ -526,7 +526,7 @@ namespace EsmTool try { const ToUTF8::StatelessUtf8Encoder encoder(ToUTF8::calculateEncoding(info.encoding)); - ESM4::Reader reader(std::move(stream), info.filename, nullptr, &encoder); + ESM4::Reader reader(std::move(stream), info.filename, nullptr, &encoder, true); const Params params(info); if (!params.mQuite) diff --git a/components/esm4/reader.cpp b/components/esm4/reader.cpp index 8ce5b1f4d2..4a3569eeac 100644 --- a/components/esm4/reader.cpp +++ b/components/esm4/reader.cpp @@ -91,11 +91,12 @@ namespace ESM4 } Reader::Reader(Files::IStreamPtr&& esmStream, const std::filesystem::path& filename, VFS::Manager const* vfs, - const ToUTF8::StatelessUtf8Encoder* encoder) + const ToUTF8::StatelessUtf8Encoder* encoder, bool ignoreMissingLocalizedStrings) : mVFS(vfs) , mEncoder(encoder) , mFileSize(0) , mStream(std::move(esmStream)) + , mIgnoreMissingLocalizedStrings(ignoreMissingLocalizedStrings) { // used by ESMReader only? 
mCtx.filename = filename; @@ -243,12 +244,28 @@ namespace ESM4 if (mVFS != nullptr) { - const Files::IStreamPtr stream = mVFS->get(Files::pathToUnicodeString(path)); + const std::string vfsPath = Files::pathToUnicodeString(path); + + if (mIgnoreMissingLocalizedStrings && !mVFS->exists(vfsPath)) + { + Log(Debug::Warning) << "Ignore missing VFS strings file: " << vfsPath; + return; + } + + const Files::IStreamPtr stream = mVFS->get(vfsPath); buildLStringIndex(stringType, *stream); return; } - const Files::IStreamPtr stream = Files::openConstrainedFileStream(mCtx.filename.parent_path() / path); + const std::filesystem::path fsPath = mCtx.filename.parent_path() / path; + + if (mIgnoreMissingLocalizedStrings && !std::filesystem::exists(fsPath)) + { + Log(Debug::Warning) << "Ignore missing strings file: " << fsPath; + return; + } + + const Files::IStreamPtr stream = Files::openConstrainedFileStream(fsPath); buildLStringIndex(stringType, *stream); } @@ -361,8 +378,12 @@ namespace ESM4 const auto it = mLStringIndex.find(stringId); if (it == mLStringIndex.end()) + { + if (mIgnoreMissingLocalizedStrings) + return; throw std::runtime_error( "ESM4::Reader::getLocalizedString localized string not found for " + formIdToString(stringId)); + } str = it->second; } diff --git a/components/esm4/reader.hpp b/components/esm4/reader.hpp index 2f6dd6bfdf..6b712ae9d8 100644 --- a/components/esm4/reader.hpp +++ b/components/esm4/reader.hpp @@ -160,6 +160,8 @@ namespace ESM4 std::vector* mGlobalReaderList = nullptr; + bool mIgnoreMissingLocalizedStrings = false; + void buildLStringIndex(LocalizedStringType stringType, const std::u8string& prefix); void buildLStringIndex(LocalizedStringType stringType, std::istream& stream); @@ -187,7 +189,7 @@ namespace ESM4 public: Reader(Files::IStreamPtr&& esmStream, const std::filesystem::path& filename, VFS::Manager const* vfs, - const ToUTF8::StatelessUtf8Encoder* encoder); + const ToUTF8::StatelessUtf8Encoder* encoder, bool 
ignoreMissingLocalizedStrings = false); ~Reader(); From 7c16ecb25831e19860a49632125c56cd17b83055 Mon Sep 17 00:00:00 2001 From: elsid Date: Mon, 22 May 2023 21:51:35 +0200 Subject: [PATCH 3/3] Skip more unknown ESM4 subrecords CNAM from HDPT is a part of Dawnguard.esm. ATKR from RACE is a part of Dawnguard.esm. CIS2 from SCRL is a part of Dragonborn.esm. SPMV from RACE is a part of Dragonborn.esm. --- components/esm4/common.hpp | 1 + components/esm4/loadhdpt.cpp | 1 + components/esm4/loadrace.cpp | 2 ++ components/esm4/loadscrl.cpp | 1 + 4 files changed, 5 insertions(+) diff --git a/components/esm4/common.hpp b/components/esm4/common.hpp index 77c24f6121..61f18e650d 100644 --- a/components/esm4/common.hpp +++ b/components/esm4/common.hpp @@ -571,6 +571,7 @@ namespace ESM4 SUB_FLTR = fourCC("FLTR"), // TES5 SUB_QTGL = fourCC("QTGL"), // TES5 SUB_TWAT = fourCC("TWAT"), // TES5 + SUB_SPMV = fourCC("SPMV"), // TES5 SUB_XIBS = fourCC("XIBS"), // FO3 SUB_REPL = fourCC("REPL"), // FO3 SUB_BIPL = fourCC("BIPL"), // FO3 diff --git a/components/esm4/loadhdpt.cpp b/components/esm4/loadhdpt.cpp index f308a6a3a2..51f4b629f3 100644 --- a/components/esm4/loadhdpt.cpp +++ b/components/esm4/loadhdpt.cpp @@ -94,6 +94,7 @@ void ESM4::HeadPart::load(ESM4::Reader& reader) case ESM4::SUB_MODS: case ESM4::SUB_MODT: case ESM4::SUB_RNAM: + case ESM4::SUB_CNAM: { // std::cout << "HDPT " << ESM::printName(subHdr.typeId) << " skipping..." 
<< std::endl; reader.skipSubRecordData(); diff --git a/components/esm4/loadrace.cpp b/components/esm4/loadrace.cpp index 04f3bd0fc2..1576743ed3 100644 --- a/components/esm4/loadrace.cpp +++ b/components/esm4/loadrace.cpp @@ -677,6 +677,8 @@ void ESM4::Race::load(ESM4::Reader& reader) case ESM4::SUB_SPED: case ESM4::SUB_SWMV: case ESM4::SUB_WKMV: + case ESM4::SUB_SPMV: + case ESM4::SUB_ATKR: // case ESM4::SUB_YNAM: // FO3 case ESM4::SUB_NAM2: // FO3 diff --git a/components/esm4/loadscrl.cpp b/components/esm4/loadscrl.cpp index 30cc8818fd..bc77d80d07 100644 --- a/components/esm4/loadscrl.cpp +++ b/components/esm4/loadscrl.cpp @@ -72,6 +72,7 @@ void ESM4::Scroll::load(ESM4::Reader& reader) case ESM4::SUB_MDOB: case ESM4::SUB_MODT: case ESM4::SUB_SPIT: + case ESM4::SUB_CIS2: { // std::cout << "SCRL " << ESM::printName(subHdr.typeId) << " skipping..." << std::endl; reader.skipSubRecordData();