From 5a1d4b58eaa91bbf845a41286e5b35dffb34c529 Mon Sep 17 00:00:00 2001 From: Wouter van Oortmerssen Date: Wed, 1 May 2024 10:11:17 -0700 Subject: [PATCH] Added checksum checks for builtins and compiled code This is to ensure that when you have a compiled (C++) code build, there are no version mismatches --- dev/TODO.txt | 9 -------- dev/src/bytecode.fbs | 1 + dev/src/compiler.cpp | 32 ++++++++++++++++++---------- dev/src/lobster/bytecode_generated.h | 20 +++++++++++++---- dev/src/lobster/compiler.h | 2 +- dev/src/lobster/idents.h | 6 ++++-- dev/src/lobster/lex.h | 8 +++++++ dev/src/lobster/natreg.h | 11 ++++++++++ dev/src/main.cpp | 6 ++++-- dev/src/tocpp.cpp | 7 +++++- 10 files changed, 72 insertions(+), 30 deletions(-) diff --git a/dev/TODO.txt b/dev/TODO.txt index 0a3fb7013..1c7717382 100644 --- a/dev/TODO.txt +++ b/dev/TODO.txt @@ -10,15 +10,6 @@ "function foo with 2 arguments is ambiguous with the 1 version because of default arguments" is super confusing. -- Compute a checksum of all code parsed by a particular build, and stick that both in the .pak file - header, and in the FlatBuffers metadata. That way, when doing a C++ build, these two numbers can - be checked against eachother to ensure they are from the same build. - What is also fragile is the Lobster binary that compiled the bytecode compared to the one compiled into - tocpp compiled lobster code: if one has a different set of builtin functions, it ends up calling the - wrong function (since the bytecode contains just indices). So should also checksum some aspect of - the runtime.. a commit id would be great but hard to add to the built process. Failing that, a checksum - of all runtime function names or whatever would be a start. - - Options for speeding up JIT mode. Recent benchmarking has shown that while the C++ mode can be pretty fast (within 5x of native C++), the JIT is surprisingly slow (sometimes up to 10x slower than C++ mode for pure number crunching). 
diff --git a/dev/src/bytecode.fbs b/dev/src/bytecode.fbs index 86910e54c..f5447dc2c 100644 --- a/dev/src/bytecode.fbs +++ b/dev/src/bytecode.fbs @@ -92,6 +92,7 @@ table BytecodeFile { ser_ids:[int]; build_info:string; + src_hash:ulong; } root_type BytecodeFile; diff --git a/dev/src/compiler.cpp b/dev/src/compiler.cpp index 46f660257..425f3b1ef 100644 --- a/dev/src/compiler.cpp +++ b/dev/src/compiler.cpp @@ -117,12 +117,13 @@ bool IsCompressed(string_view filename) { static const uint8_t *magic = (uint8_t *)"LPAK"; static const size_t magic_size = 4; -static const size_t header_size = magic_size + sizeof(int64_t) * 3; +static const size_t header_size = magic_size + sizeof(int64_t) * 4; static const char *bcname = "bytecode.lbc"; +static const int64_t current_version = 2; template int64_t LE(T x) { return flatbuffers::EndianScalar((int64_t)x); }; -string BuildPakFile(string &pakfile, string &bytecode, set &files) { +string BuildPakFile(string &pakfile, string &bytecode, set &files, uint64_t src_hash) { // All offsets in 64bit, just in-case we ever want pakfiles > 4GB :) // Since we're building this in memory, they can only be created by a 64bit build. vector filestarts; @@ -196,7 +197,9 @@ string BuildPakFile(string &pakfile, string &bytecode, set &files) { auto num = LE(filestarts.size()); // Finally the "header" (or do we call this a "tailer" ? 
;) auto header_start = pakfile.size(); - auto version = LE(1); + auto version = LE(current_version); + auto src_hash_le = LE(src_hash); + pakfile.insert(pakfile.end(), (uint8_t *)&src_hash_le, (uint8_t *)(&src_hash_le + 1)); pakfile.insert(pakfile.end(), (uint8_t *)&num, (uint8_t *)(&num + 1)); pakfile.insert(pakfile.end(), (uint8_t *)&dirstart, (uint8_t *)(&dirstart + 1)); pakfile.insert(pakfile.end(), (uint8_t *)&version, (uint8_t *)(&version + 1)); @@ -208,7 +211,7 @@ string BuildPakFile(string &pakfile, string &bytecode, set &files) { // This just loads the directory part of a pakfile such that subsequent LoadFile calls know how // to load from it. -bool LoadPakDir(const char *lpak) { +bool LoadPakDir(const char *lpak, uint64_t &src_hash_dest) { // This supports reading from a pakfile > 4GB even on a 32bit system! (as long as individual // files in it are <= 4GB). auto plen = LoadFile(lpak, nullptr, 0, 0); @@ -221,10 +224,11 @@ bool LoadPakDir(const char *lpak) { memcpy(&r, p, sizeof(int64_t)); return LE(r); }; - auto num = read_unaligned64(header.c_str()); - auto dirstart = read_unaligned64((int64_t *)header.c_str() + 1); - auto version = read_unaligned64((int64_t *)header.c_str() + 2); - if (version > 1) return false; + auto src_hash = (uint64_t)read_unaligned64((int64_t *)header.c_str()); + auto num = read_unaligned64((int64_t *)header.c_str() + 1); + auto dirstart = read_unaligned64((int64_t *)header.c_str() + 2); + auto version = read_unaligned64((int64_t *)header.c_str() + 3); + if (version != current_version) return false; if (dirstart > plen) return false; string dir; if (LoadFile(lpak, &dir, dirstart, plen - dirstart - (int64_t)header_size) < 0) @@ -240,6 +244,7 @@ bool LoadPakDir(const char *lpak) { LOG_INFO("pakfile dir: ", name, " : ", len); AddPakFileEntry(lpak, name, off, len, read_unaligned64(uncompressed + i)); } + src_hash_dest = src_hash; return true; } @@ -387,10 +392,11 @@ void Compile(NativeRegistry &nfr, string_view fn, string_view 
stringsource, stri Optimizer opt(parser, st, tc, runtime_checks); if (parsedump) *parsedump = parser.DumpAll(true); CodeGen cg(parser, st, return_value, runtime_checks); + auto src_hash = lex.HashAll(); st.Serialize(cg.code, cg.type_table, cg.lineinfo, cg.sids, cg.stringtable, bytecode, cg.vtables, - filenames, cg.ser_ids); + filenames, cg.ser_ids, src_hash); if (pakfile) { - auto err = BuildPakFile(*pakfile, bytecode, parser.pakfiles); + auto err = BuildPakFile(*pakfile, bytecode, parser.pakfiles, src_hash); if (!err.empty()) THROW_OR_ABORT(err); } } @@ -529,7 +535,8 @@ extern "C" int RunCompiledCodeMain(int argc, const char *const *argv, const uint min_output_level = OUTPUT_WARN; InitPlatform(GetMainDirFromExePath(argv[0]), aux_src_path, false, loader); auto from_lpak = true; - if (!LoadPakDir("default.lpak")) { + uint64_t src_hash = 0; + if (!LoadPakDir("default.lpak", src_hash)) { // FIXME: this is optional, we don't know if the compiled code wants to load this // file, so we don't error or even warn if this file can't be found. 
from_lpak = false; @@ -548,6 +555,9 @@ extern "C" int RunCompiledCodeMain(int argc, const char *const *argv, const uint }; for (int arg = 1; arg < argc; arg++) { vmargs.program_args.push_back(argv[arg]); } lobster::VMAllocator vma(std::move(vmargs)); + if (from_lpak && src_hash != vma.vm->bcf->src_hash()) { + THROW_OR_ABORT("lpak file from different version of the source code than the compiled code"); + } vma.vm->EvalProgram(); } #ifdef USE_EXCEPTION_HANDLING diff --git a/dev/src/lobster/bytecode_generated.h b/dev/src/lobster/bytecode_generated.h index df6f73ab5..2409e0381 100644 --- a/dev/src/lobster/bytecode_generated.h +++ b/dev/src/lobster/bytecode_generated.h @@ -579,7 +579,8 @@ struct BytecodeFile FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { VT_ENUMS = 32, VT_VTABLES = 34, VT_SER_IDS = 38, - VT_BUILD_INFO = 40 + VT_BUILD_INFO = 40, + VT_SRC_HASH = 42 }; int32_t bytecode_version() const { return GetField(VT_BYTECODE_VERSION, 0); @@ -623,6 +624,9 @@ struct BytecodeFile FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { const ::flatbuffers::String *build_info() const { return GetPointer(VT_BUILD_INFO); } + uint64_t src_hash() const { + return GetField(VT_SRC_HASH, 0); + } bool Verify(::flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyField(verifier, VT_BYTECODE_VERSION, 4) && @@ -658,6 +662,7 @@ struct BytecodeFile FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { verifier.VerifyVector(ser_ids()) && VerifyOffset(verifier, VT_BUILD_INFO) && verifier.VerifyString(build_info()) && + VerifyField(verifier, VT_SRC_HASH, 8) && verifier.EndTable(); } }; @@ -708,6 +713,9 @@ struct BytecodeFileBuilder { void add_build_info(::flatbuffers::Offset<::flatbuffers::String> build_info) { fbb_.AddOffset(BytecodeFile::VT_BUILD_INFO, build_info); } + void add_src_hash(uint64_t src_hash) { + fbb_.AddElement(BytecodeFile::VT_SRC_HASH, src_hash, 0); + } explicit BytecodeFileBuilder(::flatbuffers::FlatBufferBuilder &_fbb) : 
fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -734,8 +742,10 @@ inline ::flatbuffers::Offset CreateBytecodeFile( ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> enums = 0, ::flatbuffers::Offset<::flatbuffers::Vector> vtables = 0, ::flatbuffers::Offset<::flatbuffers::Vector> ser_ids = 0, - ::flatbuffers::Offset<::flatbuffers::String> build_info = 0) { + ::flatbuffers::Offset<::flatbuffers::String> build_info = 0, + uint64_t src_hash = 0) { BytecodeFileBuilder builder_(_fbb); + builder_.add_src_hash(src_hash); builder_.add_build_info(build_info); builder_.add_ser_ids(ser_ids); builder_.add_vtables(vtables); @@ -768,7 +778,8 @@ inline ::flatbuffers::Offset CreateBytecodeFileDirect( const std::vector<::flatbuffers::Offset> *enums = nullptr, const std::vector *vtables = nullptr, const std::vector *ser_ids = nullptr, - const char *build_info = nullptr) { + const char *build_info = nullptr, + uint64_t src_hash = 0) { auto bytecode__ = bytecode ? _fbb.CreateVector(*bytecode) : 0; auto typetable__ = typetable ? _fbb.CreateVector(*typetable) : 0; auto stringtable__ = stringtable ? 
_fbb.CreateVector<::flatbuffers::Offset<::flatbuffers::String>>(*stringtable) : 0; @@ -797,7 +808,8 @@ inline ::flatbuffers::Offset CreateBytecodeFileDirect( enums__, vtables__, ser_ids__, - build_info__); + build_info__, + src_hash); } inline const bytecode::BytecodeFile *GetBytecodeFile(const void *buf) { diff --git a/dev/src/lobster/compiler.h b/dev/src/lobster/compiler.h index 5706074ea..a4af6d8ee 100644 --- a/dev/src/lobster/compiler.h +++ b/dev/src/lobster/compiler.h @@ -34,7 +34,7 @@ extern pair RunTCC(NativeRegistry &nfr, int runtime_checks, bool dump_leaks); -extern bool LoadPakDir(const char *lpak); +extern bool LoadPakDir(const char *lpak, uint64_t &src_hash_dest); extern bool LoadByteCode(string &bytecode); extern void RegisterBuiltin(NativeRegistry &natreg, const char *ns, const char *name, void (* regfun)(NativeRegistry &)); diff --git a/dev/src/lobster/idents.h b/dev/src/lobster/idents.h index 8829f8430..2e34eeec5 100644 --- a/dev/src/lobster/idents.h +++ b/dev/src/lobster/idents.h @@ -1286,7 +1286,8 @@ struct SymbolTable { string &bytecode, vector &vtables, vector> &filenames, - vector &ser_ids) { + vector &ser_ids, + uint64_t src_hash) { flatbuffers::FlatBufferBuilder fbb; // Always serialize this first! that way it can easily be left out of the generated C code. 
auto codevec = fbb.CreateVector(code); @@ -1327,7 +1328,8 @@ struct SymbolTable { fbb.CreateVector(enumoffsets), fbb.CreateVector(vtables), fbb.CreateVector((vector &)ser_ids), - fbb.CreateString(build_info.c_str(), build_info.size())); + fbb.CreateString(build_info.c_str(), build_info.size()), + src_hash); bytecode::FinishBytecodeFileBuffer(fbb, bcf); bytecode.assign(fbb.GetBufferPointer(), fbb.GetBufferPointer() + fbb.GetSize()); } diff --git a/dev/src/lobster/lex.h b/dev/src/lobster/lex.h index 992dd473f..38b2ec972 100644 --- a/dev/src/lobster/lex.h +++ b/dev/src/lobster/lex.h @@ -96,6 +96,14 @@ struct Lex : LoadedFile { FirstToken(); } + uint64_t HashAll() { + uint64_t h = 0xABADCAFEDEADBEEF; + for (auto &src : allsources) { + h ^= FNV1A64(*src.get()); + } + return h; + } + void FirstToken() { Next(); if (token == T_LINEFEED) Next(); diff --git a/dev/src/lobster/natreg.h b/dev/src/lobster/natreg.h index f3fb092cf..eea39221f 100644 --- a/dev/src/lobster/natreg.h +++ b/dev/src/lobster/natreg.h @@ -479,6 +479,17 @@ struct NativeRegistry { auto it = nfunlookup.find(name); return it != nfunlookup.end() ? it->second : nullptr; } + + uint64_t HashAll() { + uint64_t h = 0xABADCAFEDEADBEEF; + for (auto nf : nfuns) { + h ^= FNV1A64(nf->name); + for (auto &a : nf->args) { + h ^= FNV1A64(a.name); + } + } + return h; + } }; struct Line { diff --git a/dev/src/main.cpp b/dev/src/main.cpp index 5b8f9234e..1a9698d07 100644 --- a/dev/src/main.cpp +++ b/dev/src/main.cpp @@ -205,8 +205,10 @@ int main(int argc, char* argv[]) { string bytecode_buffer; if (fn.empty()) { - if (!LoadPakDir(default_lpak)) - THROW_OR_ABORT("Lobster programming language compiler/runtime (version " + uint64_t src_hash = 0; // Don't care, from same file as bytecode. 
+ if (!LoadPakDir(default_lpak, src_hash)) + THROW_OR_ABORT( + "Lobster programming language compiler/runtime (version " GIT_COMMIT_INFOSTR ")\nno arguments given - cannot load " + (default_lpak + helptext)); // This will now come from the pakfile. diff --git a/dev/src/tocpp.cpp b/dev/src/tocpp.cpp index b398b21b2..600d219b6 100644 --- a/dev/src/tocpp.cpp +++ b/dev/src/tocpp.cpp @@ -560,7 +560,12 @@ string ToCPP(NativeRegistry &natreg, string &sd, string_view bytecode_buffer, bo } if (cpp) sd += "extern \"C\" "; sd += "void compiled_entry_point(VMRef vm, StackPtr sp) {\n"; - if (!cpp) sd += " Entry(sizeof(Value));\n"; + if (cpp) { + append(sd, " if (vm.nfr.HashAll() != ", natreg.HashAll(), + ") vm.BuiltinError(\"code compiled with mismatching builtin function library\");\n"); + } else { + sd += " Entry(sizeof(Value));\n"; + } append(sd, " fun_", starting_point, "(vm, sp);\n}\n\n"); if (cpp) { sd += "int main(int argc, char *argv[]) {\n";