From 803d9aec77f18496007d493d7ad3b60bc15c734f Mon Sep 17 00:00:00 2001 From: Michael Schaffner Date: Tue, 16 Jan 2024 16:58:27 -0800 Subject: [PATCH] [prim, rom_ctrl] Remove S&P layer from data scrambling As elaborated on #20788, the S&P layer is disabled in the SRAM scrambling devices in order to improve error detection guarantees, interactions with ECC and timing. In order to minimize changes and keep the implementation around in case it is needed for byte parity at some point, we just set the NumDiffRounds parameter to zero for the modules that leverage prim_ram_1p_scr. In case of rom_ctrl, the functionality is removed entirely. Signed-off-by: Michael Schaffner --- hw/dv/sv/mem_bkdr_util/mem_bkdr_util__rom.sv | 1 - hw/dv/sv/mem_bkdr_util/sram_scrambler_pkg.sv | 119 ++++++++++-------- .../verilator/cpp/scrambled_ecc32_mem_area.cc | 10 +- hw/ip/otbn/rtl/otbn.sv | 4 +- .../dv/prim_ram_scr/cpp/scramble_model.cc | 30 +++-- .../prim/dv/prim_ram_scr/cpp/scramble_model.h | 10 +- hw/ip/prim/rtl/prim_ram_1p_scr.sv | 7 +- hw/ip/rom_ctrl/doc/rom_ctrl_blockdiag.svg | 57 +++------ hw/ip/rom_ctrl/doc/theory_of_operation.md | 2 +- hw/ip/rom_ctrl/rtl/rom_ctrl_scrambled_rom.sv | 16 +-- hw/ip/rom_ctrl/util/scramble_image.py | 17 +-- hw/ip/sram_ctrl/doc/sram_ctrl_blockdiag.svg | 2 +- hw/ip/sram_ctrl/doc/theory_of_operation.md | 5 +- hw/ip/sram_ctrl/rtl/sram_ctrl.sv | 3 +- 14 files changed, 127 insertions(+), 156 deletions(-) diff --git a/hw/dv/sv/mem_bkdr_util/mem_bkdr_util__rom.sv b/hw/dv/sv/mem_bkdr_util/mem_bkdr_util__rom.sv index d021ca569d10c..e96c15bc4171f 100644 --- a/hw/dv/sv/mem_bkdr_util/mem_bkdr_util__rom.sv +++ b/hw/dv/sv/mem_bkdr_util/mem_bkdr_util__rom.sv @@ -59,7 +59,6 @@ virtual function bit [38:0] rom_encrypt_read32(bit [bus_params_pkg::BUS_AW-1:0] zero_key[i] = '0; end - data_arr = sram_scrambler_pkg::sp_decrypt(data_arr, 39, zero_key); for (int i = 0; i < 39; i++) begin data[i] = data_arr[i] ^ keystream[i]; end diff --git a/hw/dv/sv/mem_bkdr_util/sram_scrambler_pkg.sv b/hw/dv/sv/mem_bkdr_util/sram_scrambler_pkg.sv index f832a5f5b319b..69d3c94668c48 100644 --- a/hw/dv/sv/mem_bkdr_util/sram_scrambler_pkg.sv +++ b/hw/dv/sv/mem_bkdr_util/sram_scrambler_pkg.sv @@ -237,10 +237,10 @@ package sram_scrambler_pkg; // SRAM data encryption is more involved, we need to run 2 rounds of PRINCE on the nonce and key // and then XOR the result with the data. // - // After that, the XORed data neeeds to them be passed through the S&P network one byte at a time. + // Optionally, the XORed data can be passed through the S&P network. function automatic state_t encrypt_sram_data(logic data[], int data_width, int sp_width, logic addr[], int addr_width, - logic key[], logic nonce[]); + logic key[], logic nonce[], bit use_sp_layer = 0); logic keystream[] = new[SRAM_BLOCK_WIDTH]; logic data_enc[] = new[data_width]; logic byte_to_enc[] = new[8]; @@ -262,31 +262,33 @@ package sram_scrambler_pkg; data_enc[i] = data[i] ^ keystream[i % ks_width]; end - if (data_width == sp_width) begin - // pass the entire word through the subst/perm network at once (the next cases would give the - // same results too, but this should be a bit more efficient) - data_enc = sp_encrypt(data_enc, data_width, zero_key); - end else if (sp_width == 8) begin - // pass each byte of the encoded result through the subst/perm network (special case of the - // general code below) - for (int i = 0; i < data_width / 8; i++) begin - byte_to_enc = data_enc[i*8 +: 8]; - enc_byte = sp_encrypt(byte_to_enc, 8, zero_key); - data_enc[i*8 +: 8] = enc_byte; - end - end else begin - // divide the word into sp_width chunks to pass it through the subst/perm network - for (int chunk_lsb = 0; chunk_lsb < data_width; chunk_lsb += sp_width) begin - int bits_remaining = data_width - chunk_lsb; - int chunk_width = (bits_remaining < sp_width) ? bits_remaining : sp_width; - logic chunk[] = new[chunk_width]; - - for (int j = 0; j < chunk_width; j++) begin - chunk[j] = data_enc[chunk_lsb + j]; + if (use_sp_layer) begin + if (data_width == sp_width) begin + // pass the entire word through the subst/perm network at once (the next cases would give the + // same results too, but this should be a bit more efficient) + data_enc = sp_encrypt(data_enc, data_width, zero_key); + end else if (sp_width == 8) begin + // pass each byte of the encoded result through the subst/perm network (special case of the + // general code below) + for (int i = 0; i < data_width / 8; i++) begin + byte_to_enc = data_enc[i*8 +: 8]; + enc_byte = sp_encrypt(byte_to_enc, 8, zero_key); + data_enc[i*8 +: 8] = enc_byte; end - chunk = sp_encrypt(chunk, chunk_width, zero_key); - for (int j = 0; j < chunk_width; j++) begin - data_enc[chunk_lsb + j] = chunk[j]; + end else begin + // divide the word into sp_width chunks to pass it through the subst/perm network + for (int chunk_lsb = 0; chunk_lsb < data_width; chunk_lsb += sp_width) begin + int bits_remaining = data_width - chunk_lsb; + int chunk_width = (bits_remaining < sp_width) ? bits_remaining : sp_width; + logic chunk[] = new[chunk_width]; + + for (int j = 0; j < chunk_width; j++) begin + chunk[j] = data_enc[chunk_lsb + j]; + end + chunk = sp_encrypt(chunk, chunk_width, zero_key); + for (int j = 0; j < chunk_width; j++) begin + data_enc[chunk_lsb + j] = chunk[j]; + end end end end @@ -296,7 +298,7 @@ package sram_scrambler_pkg; function automatic state_t decrypt_sram_data(logic data[], int data_width, int sp_width, logic addr[], int addr_width, - logic key[], logic nonce[]); + logic key[], logic nonce[], bit use_sp_layer = 0); logic keystream[] = new[SRAM_BLOCK_WIDTH]; logic data_dec[] = new[data_width]; logic byte_to_dec[] = new[8]; @@ -312,38 +314,45 @@ package sram_scrambler_pkg; // Generate the keystream keystream = gen_keystream(addr, addr_width, key, nonce); - if (data_width == sp_width) begin - // pass the entire word through the subst/perm network at once (the next cases would give the - // same results too, but this should be a bit more efficient) - data_dec = sp_decrypt(data, data_width, zero_key); - end else if (sp_width == 8) begin - // pass each byte of the data through the subst/perm network (special case of the general code - // below) - for (int i = 0; i < data_width / 8; i++) begin - byte_to_dec = data[i*8 +: 8]; - dec_byte = sp_decrypt(byte_to_dec, 8, zero_key); - data_dec[i*8 +: 8] = dec_byte; - end - end else begin - // divide the word into sp_width chunks to pass it through the subst/perm network - for (int chunk_lsb = 0; chunk_lsb < data_width; chunk_lsb += sp_width) begin - int bits_remaining = data_width - chunk_lsb; - int chunk_width = (bits_remaining < sp_width) ? bits_remaining : sp_width; - logic chunk[] = new[chunk_width]; - - for (int j = 0; j < chunk_width; j++) begin - chunk[j] = data[chunk_lsb + j]; + if (use_sp_layer) begin + if (data_width == sp_width) begin + // pass the entire word through the subst/perm network at once (the next cases would give the + // same results too, but this should be a bit more efficient) + data_dec = sp_decrypt(data, data_width, zero_key); + end else if (sp_width == 8) begin + // pass each byte of the data through the subst/perm network (special case of the general code + // below) + for (int i = 0; i < data_width / 8; i++) begin + byte_to_dec = data[i*8 +: 8]; + dec_byte = sp_decrypt(byte_to_dec, 8, zero_key); + data_dec[i*8 +: 8] = dec_byte; end - chunk = sp_decrypt(chunk, chunk_width, zero_key); - for (int j = 0; j < chunk_width; j++) begin - data_dec[chunk_lsb + j] = chunk[j]; + end else begin + // divide the word into sp_width chunks to pass it through the subst/perm network + for (int chunk_lsb = 0; chunk_lsb < data_width; chunk_lsb += sp_width) begin + int bits_remaining = data_width - chunk_lsb; + int chunk_width = (bits_remaining < sp_width) ? bits_remaining : sp_width; + logic chunk[] = new[chunk_width]; + + for (int j = 0; j < chunk_width; j++) begin + chunk[j] = data[chunk_lsb + j]; + end + chunk = sp_decrypt(chunk, chunk_width, zero_key); + for (int j = 0; j < chunk_width; j++) begin + data_dec[chunk_lsb + j] = chunk[j]; + end end end - end - // XOR result data with the keystream - for (int i = 0; i < data_width; i++) begin - data_dec[i] = data_dec[i] ^ keystream[i % ks_width]; + // XOR result data with the keystream + for (int i = 0; i < data_width; i++) begin + data_dec[i] = data_dec[i] ^ keystream[i % ks_width]; + end + end else begin + // XOR result data with the keystream + for (int i = 0; i < data_width; i++) begin + data_dec[i] = data[i] ^ keystream[i % ks_width]; + end end return data_dec; diff --git a/hw/dv/verilator/cpp/scrambled_ecc32_mem_area.cc b/hw/dv/verilator/cpp/scrambled_ecc32_mem_area.cc index 65f2b36b610ad..1206c8656990c 100644 --- a/hw/dv/verilator/cpp/scrambled_ecc32_mem_area.cc +++ b/hw/dv/verilator/cpp/scrambled_ecc32_mem_area.cc @@ -162,9 +162,10 @@ void ScrambledEcc32MemArea::WriteBuffer(uint8_t buf[SV_MEM_WIDTH_BYTES], std::vector ScrambledEcc32MemArea::ReadUnscrambled( const uint8_t buf[SV_MEM_WIDTH_BYTES], uint32_t src_word) const { std::vector scrambled_data(buf, buf + GetPhysWidthByte()); - return scramble_decrypt_data( - scrambled_data, GetPhysWidth(), 39, AddrIntToBytes(src_word, addr_width_), - addr_width_, GetScrambleNonce(), GetScrambleKey(), repeat_keystream_); + return scramble_decrypt_data(scrambled_data, GetPhysWidth(), 39, + AddrIntToBytes(src_word, addr_width_), + addr_width_, GetScrambleNonce(), + GetScrambleKey(), repeat_keystream_, false); } void ScrambledEcc32MemArea::ReadBuffer(std::vector &data, @@ -196,7 +197,8 @@ void ScrambledEcc32MemArea::ScrambleBuffer(uint8_t buf[SV_MEM_WIDTH_BYTES], // Scramble data with integrity scramble_buf = scramble_encrypt_data( scramble_buf, GetPhysWidth(), 39, AddrIntToBytes(dst_word, addr_width_), - addr_width_, GetScrambleNonce(), GetScrambleKey(), repeat_keystream_); + addr_width_, GetScrambleNonce(), GetScrambleKey(), repeat_keystream_, + false); // Copy scrambled data to write buffer std::copy(scramble_buf.begin(), scramble_buf.end(), &buf[0]); diff --git a/hw/ip/otbn/rtl/otbn.sv b/hw/ip/otbn/rtl/otbn.sv index 03c0b01cc13fe..d35f08c9aa6b1 100644 --- a/hw/ip/otbn/rtl/otbn.sv +++ b/hw/ip/otbn/rtl/otbn.sv @@ -332,8 +332,7 @@ module otbn .Width (39), .Depth (ImemSizeWords), .DataBitsPerMask(39), - .EnableParity (0), - .DiffWidth (39) + .EnableParity (0) ) u_imem ( .clk_i, .rst_ni(rst_n), @@ -535,7 +534,6 @@ module otbn .Depth (DmemSizeWords), .DataBitsPerMask (39), .EnableParity (0), - .DiffWidth (39), .ReplicateKeyStream(1) ) u_dmem ( .clk_i, diff --git a/hw/ip/prim/dv/prim_ram_scr/cpp/scramble_model.cc b/hw/ip/prim/dv/prim_ram_scr/cpp/scramble_model.cc index e8f7d7af8b053..14a67901c6e1d 100644 --- a/hw/ip/prim/dv/prim_ram_scr/cpp/scramble_model.cc +++ b/hw/ip/prim/dv/prim_ram_scr/cpp/scramble_model.cc @@ -322,7 +322,7 @@ std::vector scramble_encrypt_data( const std::vector &data_in, uint32_t data_width, uint32_t subst_perm_width, const std::vector &addr, uint32_t addr_width, const std::vector &nonce, - const std::vector &key, bool repeat_keystream) { + const std::vector &key, bool repeat_keystream, bool use_sp_layer) { assert(data_in.size() == ((data_width + 7) / 8)); assert(addr.size() == ((addr_width + 7) / 8)); @@ -335,28 +335,32 @@ std::vector scramble_encrypt_data( auto data_enc = xor_vectors(data_in, keystream); - return scramble_subst_perm_full_width(data_enc, data_width, subst_perm_width, - true); + if (use_sp_layer) { + return scramble_subst_perm_full_width(data_enc, data_width, + subst_perm_width, true); + } else { + return data_enc; + } } std::vector scramble_decrypt_data( const std::vector &data_in, uint32_t data_width, uint32_t subst_perm_width, const std::vector &addr, uint32_t addr_width, const std::vector &nonce, - const std::vector &key, bool repeat_keystream) { + const std::vector &key, bool repeat_keystream, bool use_sp_layer) { assert(data_in.size() == ((data_width + 7) / 8)); assert(addr.size() == ((addr_width + 7) / 8)); - // Data is decrypted by reversing substitution/permutation layer then XORing - // with keystream - auto data_sp_out = scramble_subst_perm_full_width(data_in, data_width, - subst_perm_width, false); - auto keystream = scramble_gen_keystream(addr, addr_width, nonce, key, data_width, kNumPrinceHalfRounds, repeat_keystream); - - auto data_dec = xor_vectors(data_sp_out, keystream); - - return data_dec; + if (use_sp_layer) { + // Data is decrypted by reversing substitution/permutation layer then XORing + // with keystream + auto data_sp_out = scramble_subst_perm_full_width(data_in, data_width, + subst_perm_width, false); + return xor_vectors(data_sp_out, keystream); + } else { + return xor_vectors(data_in, keystream); + } } diff --git a/hw/ip/prim/dv/prim_ram_scr/cpp/scramble_model.h b/hw/ip/prim/dv/prim_ram_scr/cpp/scramble_model.h index 4a3875d29b973..9a14405eb8429 100644 --- a/hw/ip/prim/dv/prim_ram_scr/cpp/scramble_model.h +++ b/hw/ip/prim/dv/prim_ram_scr/cpp/scramble_model.h @@ -40,13 +40,16 @@ std::vector scramble_addr(const std::vector &addr_in, * @param repeat_keystream Repeat the keystream of one single PRINCE instance if * set to true. Otherwise multiple PRINCE instances are * used. + * @param use_sp_layer Use the S&P layer for data diffusion. In HW this is + * disabled by default since it interacts adversely with + * the end-to-end integrity scheme. * @return Byte vector with decrypted data */ std::vector scramble_decrypt_data( const std::vector &data_in, uint32_t data_width, uint32_t subst_perm_width, const std::vector &addr, uint32_t addr_width, const std::vector &nonce, - const std::vector &key, bool repeat_keystream); + const std::vector &key, bool repeat_keystream, bool use_sp_layer); /** Encrypt scrambled data * @param data_in Byte vector of data to encrypt @@ -60,12 +63,15 @@ std::vector scramble_decrypt_data( * @param repeat_keystream Repeat the keystream of one single PRINCE instance if * set to true. Otherwise multiple PRINCE instances are * used. + * @param use_sp_layer Use the S&P layer for data diffusion. In HW this is + * disabled by default since it interacts adversely with + * the end-to-end integrity scheme. * @return Byte vector with encrypted data */ std::vector scramble_encrypt_data( const std::vector &data_in, uint32_t data_width, uint32_t subst_perm_width, const std::vector &addr, uint32_t addr_width, const std::vector &nonce, - const std::vector &key, bool repeat_keystream); + const std::vector &key, bool repeat_keystream, bool use_sp_layer); #endif // OPENTITAN_HW_IP_PRIM_DV_PRIM_RAM_SCR_CPP_SCRAMBLE_MODEL_H_ diff --git a/hw/ip/prim/rtl/prim_ram_1p_scr.sv b/hw/ip/prim/rtl/prim_ram_1p_scr.sv index 30655819c6d7a..36f786a7c2b8d 100644 --- a/hw/ip/prim/rtl/prim_ram_1p_scr.sv +++ b/hw/ip/prim/rtl/prim_ram_1p_scr.sv @@ -34,8 +34,11 @@ module prim_ram_1p_scr import prim_ram_1p_pkg::*; #( // to 2*5 + 1 effective rounds. Setting this to 2 halves this to approximately 5 effective rounds. // Number of PRINCE half rounds, can be [1..5] parameter int NumPrinceRoundsHalf = 2, - // Number of extra diffusion rounds. Setting this to 0 to disable diffusion. - parameter int NumDiffRounds = 2, + // Number of extra diffusion rounds. Setting this to 0 to disables diffusion. + // NOTE: this is zero by default, since the non-linear transformation of data bits can interact + // adversely with end-to-end ECC integrity. Only enable this if you know what you are doing + // (e.g. using this primitive in a different context with byte parity). See #20788 for context. + parameter int NumDiffRounds = 0, // This parameter governs the block-width of additional diffusion layers. // For intra-byte diffusion, set this parameter to 8. parameter int DiffWidth = DataBitsPerMask, diff --git a/hw/ip/rom_ctrl/doc/rom_ctrl_blockdiag.svg b/hw/ip/rom_ctrl/doc/rom_ctrl_blockdiag.svg index 969c1f57c5782..f4069c8d40ec1 100644 --- a/hw/ip/rom_ctrl/doc/rom_ctrl_blockdiag.svg +++ b/hw/ip/rom_ctrl/doc/rom_ctrl_blockdiag.svg @@ -1,12 +1,5 @@ + height="779.99994" + xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" + xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" + xmlns="http://www.w3.org/2000/svg" + xmlns:svg="http://www.w3.org/2000/svg" + xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" + xmlns:cc="http://creativecommons.org/ns#" + xmlns:dc="http://purl.org/dc/elements/1.1/"> @@ -716,13 +716,13 @@ guidetolerance="10" inkscape:pageopacity="0" inkscape:pageshadow="2" - inkscape:window-width="1920" - inkscape:window-height="1043" + inkscape:window-width="3440" + inkscape:window-height="1376" id="namedview2851" showgrid="false" inkscape:zoom="1.7927814" - inkscape:cx="1069.3263" - inkscape:cy="296.72112" + inkscape:cx="1069.5671" + inkscape:cy="296.46671" inkscape:window-x="0" inkscape:window-y="0" inkscape:window-maximized="1" @@ -733,7 +733,8 @@ fit-margin-top="0" fit-margin-left="0" fit-margin-right="0" - fit-margin-bottom="0"> + fit-margin-bottom="0" + inkscape:pagecheckerboard="0"> S&P - - - S&P - (trunc) - int: return subst_perm_dec(phy_addr, addr_scr_nonce, self._addr_width, self.subst_perm_rounds) - def data_sp_enc(self, width: int, data: int) -> int: - return subst_perm_enc(data, 0, width, self.subst_perm_rounds) - - def data_sp_dec(self, width: int, data: int) -> int: - return subst_perm_dec(data, 0, width, self.subst_perm_rounds) - def scramble_word(self, width: int, log_addr: int, clr_data: int) -> int: '''Scramble clr_data at the given logical address.''' keystream = self.get_keystream(log_addr, width) - return self.data_sp_enc(width, keystream ^ clr_data) + return keystream ^ clr_data def unscramble_word(self, width: int, log_addr: int, scr_data: int) -> int: keystream = self.get_keystream(log_addr, width) - sp_scr_data = self.data_sp_dec(width, scr_data) - return keystream ^ sp_scr_data + return keystream ^ scr_data def scramble(self, mem: MemFile) -> MemFile: assert len(mem.chunks) == 1 @@ -259,15 +252,15 @@ def scramble(self, mem: MemFile) -> MemFile: # # Then, for all i, we have: # - # clr[i] = PRINCE(i) ^ data_sp_dec(scr[addr_sp_enc(i)]) + # clr[i] = PRINCE(i) ^ scr[addr_sp_enc(i)] # # Change coordinates by evaluating at addr_sp_dec(i): # - # clr[addr_sp_dec(i)] = PRINCE(addr_sp_dec(i)) ^ data_sp_dec(scr[i]) + # clr[addr_sp_dec(i)] = PRINCE(addr_sp_dec(i)) ^ scr[i] # # so # - # scr[i] = data_sp_enc(clr[addr_sp_dec(i)] ^ PRINCE(addr_sp_dec(i))) + # scr[i] = clr[addr_sp_dec(i)] ^ PRINCE(addr_sp_dec(i)) # # Using the scramble_word helper function, this is: # diff --git a/hw/ip/sram_ctrl/doc/sram_ctrl_blockdiag.svg b/hw/ip/sram_ctrl/doc/sram_ctrl_blockdiag.svg index 9315deab90b6c..3de69171db52f 100644 --- a/hw/ip/sram_ctrl/doc/sram_ctrl_blockdiag.svg +++ b/hw/ip/sram_ctrl/doc/sram_ctrl_blockdiag.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/hw/ip/sram_ctrl/doc/theory_of_operation.md b/hw/ip/sram_ctrl/doc/theory_of_operation.md index f3de13bd8554c..206d834555429 100644 --- a/hw/ip/sram_ctrl/doc/theory_of_operation.md +++ b/hw/ip/sram_ctrl/doc/theory_of_operation.md @@ -26,11 +26,8 @@ The individual mechanisms are explained in more detail in the subsections below. ## Scrambling Primitive As explained in [`prim_ram_1p_scr`](../../prim/doc/prim_ram_1p_scr.md) the scrambling mechanism employs a reduced-round PRINCE block cipher in CTR mode to scramble the data. -Since plain CTR mode does not diffuse the data bits due to the bitwise XOR, the scheme is augmented by passing each word through a shallow substitution-permutation (S&P) network implemented with the `prim_subst_perm` primitive. -The S&P network employed is similar to the one employed in PRESENT and is explained in more detail [here](../../prim/doc/prim_ram_1p_scr.md#custom-substitution-permutation-network). -Another CTR mode augmentation that is aimed at breaking the linear address space is SRAM address scrambling. -The same S&P network construction that is used for intra-word diffusion is leveraged to non-linearly remap the SRAM address as shown in the block diagram above. +In order to break the linear address space, the CTR mode is augmented with an S&P network to non-linearly remap the SRAM address as shown in the block diagram above. The S&P network employed is similar to the one employed in PRESENT and is explained in more detail [here](../../prim/doc/prim_ram_1p_scr.md#custom-substitution-permutation-network). ### Integrity Error Handling diff --git a/hw/ip/sram_ctrl/rtl/sram_ctrl.sv b/hw/ip/sram_ctrl/rtl/sram_ctrl.sv index e2185e0214b19..7a1035e5dd043 100644 --- a/hw/ip/sram_ctrl/rtl/sram_ctrl.sv +++ b/hw/ip/sram_ctrl/rtl/sram_ctrl.sv @@ -492,8 +492,7 @@ module sram_ctrl .Width(DataWidth), .Depth(Depth), .EnableParity(0), - .DataBitsPerMask(DataWidth), - .DiffWidth(DataWidth) + .DataBitsPerMask(DataWidth) ) u_prim_ram_1p_scr ( .clk_i, .rst_ni,