From 5a1d4b58eaa91bbf845a41286e5b35dffb34c529 Mon Sep 17 00:00:00 2001
From: Wouter van Oortmerssen <aardappel@gmail.com>
Date: Wed, 1 May 2024 10:11:17 -0700
Subject: [PATCH] Added checksum checks for builtins and compiled code

This is to ensure that when you have a compiled (C++) code build, there are no version mismatches.
---
 dev/TODO.txt                         |  9 --------
 dev/src/bytecode.fbs                 |  1 +
 dev/src/compiler.cpp                 | 32 ++++++++++++++++++----------
 dev/src/lobster/bytecode_generated.h | 20 +++++++++++++----
 dev/src/lobster/compiler.h           |  2 +-
 dev/src/lobster/idents.h             |  6 ++++--
 dev/src/lobster/lex.h                |  8 +++++++
 dev/src/lobster/natreg.h             | 11 ++++++++++
 dev/src/main.cpp                     |  6 ++++--
 dev/src/tocpp.cpp                    |  7 +++++-
 10 files changed, 72 insertions(+), 30 deletions(-)
diff --git a/dev/TODO.txt b/dev/TODO.txt
index 0a3fb7013..1c7717382 100644
--- a/dev/TODO.txt
+++ b/dev/TODO.txt
@@ -10,15 +10,6 @@
   "function foo with 2 arguments is ambiguous with the 1 version because of default arguments"
   is super confusing.
 
-- Compute a checksum of all code parsed by a particular build, and stick that both in the .pak file
-  header, and in the FlatBuffers metadata. That way, when doing a C++ build, these two numbers can
-  be checked against eachother to ensure they are from the same build.
-  What is also fragile is the Lobster binary that compiled the bytecode compared to the one compiled into
-  tocpp compiled lobster code: if one has a different set of builtin functions, it ends up calling the
-  wrong function (since the bytecode contains just indices). So should also checksum some aspect of
-  the runtime.. a commit id would be great but hard to add to the built process. Failing that, a checksum
-  of all runtime function names or whatever would be a start.
-
 - Options for speeding up JIT mode.
   Recent benchmarking has shown that while the C++ mode can be pretty fast (within 5x of native C++),
   the JIT is surprisingly slow (sometimes up to 10x slower than C++ mode for pure number crunching).
diff --git a/dev/src/bytecode.fbs b/dev/src/bytecode.fbs
index 86910e54c..f5447dc2c 100644
--- a/dev/src/bytecode.fbs
+++ b/dev/src/bytecode.fbs
@@ -92,6 +92,7 @@ table BytecodeFile {
     ser_ids:[int];
 
     build_info:string;
+    src_hash:ulong;
 }
 
 root_type BytecodeFile;
diff --git a/dev/src/compiler.cpp b/dev/src/compiler.cpp
index 46f660257..425f3b1ef 100644
--- a/dev/src/compiler.cpp
+++ b/dev/src/compiler.cpp
@@ -117,12 +117,13 @@ bool IsCompressed(string_view filename) {
 
 static const uint8_t *magic = (uint8_t *)"LPAK";
 static const size_t magic_size = 4;
-static const size_t header_size = magic_size + sizeof(int64_t) * 3;
+static const size_t header_size = magic_size + sizeof(int64_t) * 4;
 static const char *bcname = "bytecode.lbc";
+static const int64_t current_version = 2;
 
 template <typename T> int64_t LE(T x) { return flatbuffers::EndianScalar((int64_t)x); };
 
-string BuildPakFile(string &pakfile, string &bytecode, set<string> &files) {
+string BuildPakFile(string &pakfile, string &bytecode, set<string> &files, uint64_t src_hash) {
     // All offsets in 64bit, just in-case we ever want pakfiles > 4GB :)
     // Since we're building this in memory, they can only be created by a 64bit build.
     vector<int64_t> filestarts;
@@ -196,7 +197,9 @@ string BuildPakFile(string &pakfile, string &bytecode, set<string> &files) {
     auto num = LE(filestarts.size());
     // Finally the "header" (or do we call this a "tailer" ? ;)
     auto header_start = pakfile.size();
-    auto version = LE(1);
+    auto version = LE(current_version);
+    auto src_hash_le = LE(src_hash);
+    pakfile.insert(pakfile.end(), (uint8_t *)&src_hash_le, (uint8_t *)(&src_hash_le + 1));
     pakfile.insert(pakfile.end(), (uint8_t *)&num, (uint8_t *)(&num + 1));
     pakfile.insert(pakfile.end(), (uint8_t *)&dirstart, (uint8_t *)(&dirstart + 1));
     pakfile.insert(pakfile.end(), (uint8_t *)&version, (uint8_t *)(&version + 1));
@@ -208,7 +211,7 @@ string BuildPakFile(string &pakfile, string &bytecode, set<string> &files) {
 
 // This just loads the directory part of a pakfile such that subsequent LoadFile calls know how
 // to load from it.
-bool LoadPakDir(const char *lpak) {
+bool LoadPakDir(const char *lpak, uint64_t &src_hash_dest) {
     // This supports reading from a pakfile > 4GB even on a 32bit system! (as long as individual
     // files in it are <= 4GB).
     auto plen = LoadFile(lpak, nullptr, 0, 0);
@@ -221,10 +224,11 @@ bool LoadPakDir(const char *lpak) {
         memcpy(&r, p, sizeof(int64_t));
         return LE(r);
     };
-    auto num = read_unaligned64(header.c_str());
-    auto dirstart = read_unaligned64((int64_t *)header.c_str() + 1);
-    auto version = read_unaligned64((int64_t *)header.c_str() + 2);
-    if (version > 1) return false;
+    auto src_hash = (uint64_t)read_unaligned64((int64_t *)header.c_str());
+    auto num = read_unaligned64((int64_t *)header.c_str() + 1);
+    auto dirstart = read_unaligned64((int64_t *)header.c_str() + 2);
+    auto version = read_unaligned64((int64_t *)header.c_str() + 3);
+    if (version != current_version) return false;
     if (dirstart > plen) return false;
     string dir;
     if (LoadFile(lpak, &dir, dirstart, plen - dirstart - (int64_t)header_size) < 0)
@@ -240,6 +244,7 @@ bool LoadPakDir(const char *lpak) {
         LOG_INFO("pakfile dir: ", name, " : ", len);
         AddPakFileEntry(lpak, name, off, len, read_unaligned64(uncompressed + i));
     }
+    src_hash_dest = src_hash;
     return true;
 }
 
@@ -387,10 +392,11 @@ void Compile(NativeRegistry &nfr, string_view fn, string_view stringsource, stri
     Optimizer opt(parser, st, tc, runtime_checks);
     if (parsedump) *parsedump = parser.DumpAll(true);
     CodeGen cg(parser, st, return_value, runtime_checks);
+    auto src_hash = lex.HashAll();
     st.Serialize(cg.code, cg.type_table, cg.lineinfo, cg.sids, cg.stringtable, bytecode, cg.vtables,
-                 filenames, cg.ser_ids);
+                 filenames, cg.ser_ids, src_hash);
     if (pakfile) {
-        auto err = BuildPakFile(*pakfile, bytecode, parser.pakfiles);
+        auto err = BuildPakFile(*pakfile, bytecode, parser.pakfiles, src_hash);
         if (!err.empty()) THROW_OR_ABORT(err);
     }
 }
@@ -529,7 +535,8 @@ extern "C" int RunCompiledCodeMain(int argc, const char *const *argv, const uint
         min_output_level = OUTPUT_WARN;
         InitPlatform(GetMainDirFromExePath(argv[0]), aux_src_path, false, loader);
         auto from_lpak = true;
-        if (!LoadPakDir("default.lpak")) {
+        uint64_t src_hash = 0;
+        if (!LoadPakDir("default.lpak", src_hash)) {
             // FIXME: this is optional, we don't know if the compiled code wants to load this
             // file, so we don't error or even warn if this file can't be found.
             from_lpak = false;
@@ -548,6 +555,9 @@ extern "C" int RunCompiledCodeMain(int argc, const char *const *argv, const uint
         };
         for (int arg = 1; arg < argc; arg++) { vmargs.program_args.push_back(argv[arg]); }
         lobster::VMAllocator vma(std::move(vmargs));
+        if (from_lpak && src_hash != vma.vm->bcf->src_hash()) {
+            THROW_OR_ABORT("lpak file from different version of the source code than the compiled code");
+        }
         vma.vm->EvalProgram();
     }
     #ifdef USE_EXCEPTION_HANDLING
diff --git a/dev/src/lobster/bytecode_generated.h b/dev/src/lobster/bytecode_generated.h
index df6f73ab5..2409e0381 100644
--- a/dev/src/lobster/bytecode_generated.h
+++ b/dev/src/lobster/bytecode_generated.h
@@ -579,7 +579,8 @@ struct BytecodeFile FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
     VT_ENUMS = 32,
     VT_VTABLES = 34,
     VT_SER_IDS = 38,
-    VT_BUILD_INFO = 40
+    VT_BUILD_INFO = 40,
+    VT_SRC_HASH = 42
   };
   int32_t bytecode_version() const {
     return GetField<int32_t>(VT_BYTECODE_VERSION, 0);
@@ -623,6 +624,9 @@ struct BytecodeFile FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
   const ::flatbuffers::String *build_info() const {
     return GetPointer<const ::flatbuffers::String *>(VT_BUILD_INFO);
   }
+  uint64_t src_hash() const {
+    return GetField<uint64_t>(VT_SRC_HASH, 0);
+  }
   bool Verify(::flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<int32_t>(verifier, VT_BYTECODE_VERSION, 4) &&
@@ -658,6 +662,7 @@ struct BytecodeFile FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
            verifier.VerifyVector(ser_ids()) &&
            VerifyOffset(verifier, VT_BUILD_INFO) &&
            verifier.VerifyString(build_info()) &&
+           VerifyField<uint64_t>(verifier, VT_SRC_HASH, 8) &&
            verifier.EndTable();
   }
 };
@@ -708,6 +713,9 @@ struct BytecodeFileBuilder {
   void add_build_info(::flatbuffers::Offset<::flatbuffers::String> build_info) {
     fbb_.AddOffset(BytecodeFile::VT_BUILD_INFO, build_info);
   }
+  void add_src_hash(uint64_t src_hash) {
+    fbb_.AddElement<uint64_t>(BytecodeFile::VT_SRC_HASH, src_hash, 0);
+  }
   explicit BytecodeFileBuilder(::flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -734,8 +742,10 @@ inline ::flatbuffers::Offset<BytecodeFile> CreateBytecodeFile(
     ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset<bytecode::Enum>>> enums = 0,
     ::flatbuffers::Offset<::flatbuffers::Vector<int32_t>> vtables = 0,
     ::flatbuffers::Offset<::flatbuffers::Vector<int32_t>> ser_ids = 0,
-    ::flatbuffers::Offset<::flatbuffers::String> build_info = 0) {
+    ::flatbuffers::Offset<::flatbuffers::String> build_info = 0,
+    uint64_t src_hash = 0) {
   BytecodeFileBuilder builder_(_fbb);
+  builder_.add_src_hash(src_hash);
   builder_.add_build_info(build_info);
   builder_.add_ser_ids(ser_ids);
   builder_.add_vtables(vtables);
@@ -768,7 +778,8 @@ inline ::flatbuffers::Offset<BytecodeFile> CreateBytecodeFileDirect(
     const std::vector<::flatbuffers::Offset<bytecode::Enum>> *enums = nullptr,
     const std::vector<int32_t> *vtables = nullptr,
     const std::vector<int32_t> *ser_ids = nullptr,
-    const char *build_info = nullptr) {
+    const char *build_info = nullptr,
+    uint64_t src_hash = 0) {
   auto bytecode__ = bytecode ? _fbb.CreateVector<int32_t>(*bytecode) : 0;
   auto typetable__ = typetable ? _fbb.CreateVector<int32_t>(*typetable) : 0;
   auto stringtable__ = stringtable ? _fbb.CreateVector<::flatbuffers::Offset<::flatbuffers::String>>(*stringtable) : 0;
@@ -797,7 +808,8 @@ inline ::flatbuffers::Offset<BytecodeFile> CreateBytecodeFileDirect(
       enums__,
       vtables__,
       ser_ids__,
-      build_info__);
+      build_info__,
+      src_hash);
 }
 
 inline const bytecode::BytecodeFile *GetBytecodeFile(const void *buf) {
diff --git a/dev/src/lobster/compiler.h b/dev/src/lobster/compiler.h
index 5706074ea..a4af6d8ee 100644
--- a/dev/src/lobster/compiler.h
+++ b/dev/src/lobster/compiler.h
@@ -34,7 +34,7 @@ extern pair<string, iint> RunTCC(NativeRegistry &nfr,
                           int runtime_checks,
                           bool dump_leaks);
 
-extern bool LoadPakDir(const char *lpak);
+extern bool LoadPakDir(const char *lpak, uint64_t &src_hash_dest);
 extern bool LoadByteCode(string &bytecode);
 extern void RegisterBuiltin(NativeRegistry &natreg, const char *ns, const char *name,
                             void (* regfun)(NativeRegistry &));
diff --git a/dev/src/lobster/idents.h b/dev/src/lobster/idents.h
index 8829f8430..2e34eeec5 100644
--- a/dev/src/lobster/idents.h
+++ b/dev/src/lobster/idents.h
@@ -1286,7 +1286,8 @@ struct SymbolTable {
                    string &bytecode,
                    vector<int> &vtables,
                    vector<pair<string, string>> &filenames,
-                   vector<type_elem_t> &ser_ids) {
+                   vector<type_elem_t> &ser_ids,
+                   uint64_t src_hash) {
         flatbuffers::FlatBufferBuilder fbb;
         // Always serialize this first! that way it can easily be left out of the generated C code.
         auto codevec = fbb.CreateVector(code);
@@ -1327,7 +1328,8 @@ struct SymbolTable {
             fbb.CreateVector(enumoffsets),
             fbb.CreateVector(vtables),
             fbb.CreateVector((vector<int> &)ser_ids),
-            fbb.CreateString(build_info.c_str(), build_info.size()));
+            fbb.CreateString(build_info.c_str(), build_info.size()),
+            src_hash);
         bytecode::FinishBytecodeFileBuffer(fbb, bcf);
         bytecode.assign(fbb.GetBufferPointer(), fbb.GetBufferPointer() + fbb.GetSize());
     }
diff --git a/dev/src/lobster/lex.h b/dev/src/lobster/lex.h
index 992dd473f..38b2ec972 100644
--- a/dev/src/lobster/lex.h
+++ b/dev/src/lobster/lex.h
@@ -96,6 +96,14 @@ struct Lex : LoadedFile {
         FirstToken();
     }
 
+    uint64_t HashAll() {  // Order-sensitive hash over the contents of every loaded source.
+        uint64_t h = 0xABADCAFEDEADBEEF;
+        for (auto &src : allsources) {
+            h = (h ^ FNV1A64(*src.get())) * 0x100000001B3ULL;  // Chain: equal files can't cancel.
+        }
+        return h;
+    }
+
     void FirstToken() {
         Next();
         if (token == T_LINEFEED) Next();
diff --git a/dev/src/lobster/natreg.h b/dev/src/lobster/natreg.h
index f3fb092cf..eea39221f 100644
--- a/dev/src/lobster/natreg.h
+++ b/dev/src/lobster/natreg.h
@@ -479,6 +479,17 @@ struct NativeRegistry {
         auto it = nfunlookup.find(name);
         return it != nfunlookup.end() ? it->second : nullptr;
     }
+
+    uint64_t HashAll() {  // Hash of all builtin + arg names, sensitive to registration order.
+        uint64_t h = 0xABADCAFEDEADBEEF;
+        for (auto nf : nfuns) {
+            h = (h ^ FNV1A64(nf->name)) * 0x100000001B3ULL;  // Mix with FNV-1a 64-bit prime.
+            for (auto &a : nf->args) {
+                h = (h ^ FNV1A64(a.name)) * 0x100000001B3ULL;  // Repeated names don't cancel.
+            }
+        }
+        return h;
+    }
 };
 
 struct Line {
diff --git a/dev/src/main.cpp b/dev/src/main.cpp
index 5b8f9234e..1a9698d07 100644
--- a/dev/src/main.cpp
+++ b/dev/src/main.cpp
@@ -205,8 +205,10 @@ int main(int argc, char* argv[]) {
 
         string bytecode_buffer;
         if (fn.empty()) {
-            if (!LoadPakDir(default_lpak))
-                THROW_OR_ABORT("Lobster programming language compiler/runtime (version "
+            uint64_t src_hash = 0;  // Don't care, from same file as bytecode.
+            if (!LoadPakDir(default_lpak, src_hash))
+                THROW_OR_ABORT(
+                    "Lobster programming language compiler/runtime (version "
                                GIT_COMMIT_INFOSTR ")\nno arguments given - cannot load "
                                + (default_lpak + helptext));
             // This will now come from the pakfile.
diff --git a/dev/src/tocpp.cpp b/dev/src/tocpp.cpp
index b398b21b2..600d219b6 100644
--- a/dev/src/tocpp.cpp
+++ b/dev/src/tocpp.cpp
@@ -560,7 +560,12 @@ string ToCPP(NativeRegistry &natreg, string &sd, string_view bytecode_buffer, bo
     }
     if (cpp) sd += "extern \"C\" ";
     sd += "void compiled_entry_point(VMRef vm, StackPtr sp) {\n";
-    if (!cpp) sd += "    Entry(sizeof(Value));\n";
+    if (cpp) {
+        append(sd, "    if (vm.nfr.HashAll() != ", natreg.HashAll(),
+               ") vm.BuiltinError(\"code compiled with mismatching builtin function library\");\n");
+    } else {
+        sd += "    Entry(sizeof(Value));\n";
+    }
     append(sd, "    fun_", starting_point, "(vm, sp);\n}\n\n");
     if (cpp) {
         sd += "int main(int argc, char *argv[]) {\n";