diff --git a/stl/inc/random b/stl/inc/random
index 2f4fc83cc2..9fc33d3e15 100644
--- a/stl/inc/random
+++ b/stl/inc/random
@@ -261,30 +261,37 @@ private:
     vector<result_type> _Myvec;
 };
 
-_NODISCARD constexpr int _Generate_canonical_iterations(const int _Bits, const uint64_t _Gmin, const uint64_t _Gmax) {
-    // For a URBG type `G` with range == `(G::max() - G::min()) + 1`, returns the number of calls to generate at least
-    // _Bits bits of entropy. Specifically, max(1, ceil(_Bits / log2(range))).
-
-    _STL_INTERNAL_CHECK(0 <= _Bits && _Bits <= 64);
+struct _Urbg_gen_can_params {
+    bool _Rx_is_pow2 = false; // $R$ (in N4981 [rand.util.canonical]/1.2) is a power of 2
+    int _Kx          = 0; // $k$ in N4981 [rand.util.canonical]/1.4
+    uint64_t _Xx     = 0; // $x$ in N4981 [rand.util.canonical]/1.5
+    float _Scale     = 0.0f; // $r^{-d}$, always representable as a float
+    int _Smax_bits   = 0; // number of bits in $R^k$
+};
 
-    if (_Bits == 0 || (_Gmax == UINT64_MAX && _Gmin == 0)) {
-        return 1;
+_NODISCARD constexpr _Urbg_gen_can_params _Generate_canonical_params(const size_t _Bits, const uint64_t _Rm1) {
+    const auto _Rx        = _Unsigned128{_Rm1} + 1; // $R$ in N4981 [rand.util.canonical]/1.2
+    const auto _Target    = _Unsigned128{1} << _Bits; // $r^d$
+    _Unsigned128 _Product = 1;
+    _Urbg_gen_can_params _Result;
+    _Result._Rx_is_pow2 = (_Rx & (_Rx - 1)) == 0;
+    _Result._Kx         = 0;
+    while (_Product < _Target) {
+        _Product *= _Rx;
+        ++_Result._Kx;
     }
 
-    const auto _Range  = (_Gmax - _Gmin) + 1;
-    const auto _Target = ~uint64_t{0} >> (64 - _Bits);
-    uint64_t _Prod     = 1;
-    int _Ceil          = 0;
-    while (_Prod <= _Target) {
-        ++_Ceil;
-        if (_Prod > UINT64_MAX / _Range) {
-            break;
-        }
-
-        _Prod *= _Range;
+    _Result._Xx        = static_cast<uint64_t>(_Product / _Target);
+    _Result._Scale     = 1.0f / static_cast<float>(1ULL << _Bits);
+    _Result._Smax_bits = 0;
+    // $S \in [0, _Product)$
+    --_Product;
+    while (_Product > 0) {
+        ++_Result._Smax_bits;
+        _Product >>= 1;
     }
 
-    return _Ceil;
+    return _Result;
 }
 
 _EXPORT_STD template <class _Real, size_t _Bits, class _Gen>
@@ -294,27 +301,110 @@ _NODISCARD _Real generate_canonical(_Gen& _Gx) { // build a floating-point value
     constexpr auto _Digits  = static_cast<size_t>(numeric_limits<_Real>::digits);
     constexpr auto _Minbits = static_cast<int>(_Digits < _Bits ? _Digits : _Bits);
 
-    constexpr auto _Gxmin = static_cast<_Real>((_Gen::min)());
-    constexpr auto _Gxmax = static_cast<_Real>((_Gen::max)());
-    constexpr auto _Rx    = (_Gxmax - _Gxmin) + _Real{1};
+    if constexpr (_Minbits == 0) {
+        return _Real{0};
+    } else {
+        using _Result_uint_type = conditional_t<_Minbits <= 32, uint32_t, uint64_t>;
+
+        constexpr auto _Gxmin  = (_Gen::min)();
+        constexpr auto _Gxmax  = (_Gen::max)();
+        constexpr auto _Params = _Generate_canonical_params(_Minbits, _Gxmax - _Gxmin);
+
+        using _Sx_type = conditional_t<_Params._Smax_bits <= 32, uint32_t,
+            conditional_t<_Params._Smax_bits <= 64, uint64_t, _Unsigned128>>;
+        _Sx_type _Sx;
+
+        if constexpr (_Params._Rx_is_pow2) {
+            // Always needs only one attempt. Multiplications can be replaced with shift/add. Optimize k=1 case.
+            if constexpr (_Params._Kx == 1) {
+                _Sx = static_cast<_Sx_type>(_Gx() - _Gxmin);
+            } else if constexpr (_Params._Kx > 1) {
+                constexpr int _Rx_bits = _Params._Smax_bits / _Params._Kx;
+                _Sx                    = 0;
+                int _Shift             = 0;
+                for (int _Idx = 0; _Idx < _Params._Kx; ++_Idx) {
+                    _Sx += static_cast<_Sx_type>(_Gx() - _Gxmin) << _Shift;
+                    _Shift += _Rx_bits;
+                }
+            } else {
+                _STL_INTERNAL_STATIC_ASSERT(false); // unexpected k
+            }
 
-    constexpr int _Kx = _Generate_canonical_iterations(_Minbits, (_Gen::min)(), (_Gen::max)());
+            _Sx >>= _Params._Smax_bits - _Minbits;
+            return static_cast<_Real>(static_cast<_Result_uint_type>(_Sx)) * static_cast<_Real>(_Params._Scale);
+        } else {
+            constexpr auto _Rx        = _Sx_type{_Gxmax - _Gxmin} + 1u;
+            constexpr _Sx_type _Limit = static_cast<_Sx_type>(_Params._Xx) * (_Sx_type{1} << _Minbits);
+
+            do {
+                // unroll first two iterations to avoid unnecessary multiplications
+                _Sx = static_cast<_Sx_type>(_Gx() - _Gxmin);
+                if constexpr (_Params._Kx == 2) {
+                    _Sx += static_cast<_Sx_type>(_Gx() - _Gxmin) * _Rx;
+                } else if constexpr (_Params._Kx > 2) {
+                    _Sx += static_cast<_Sx_type>(_Gx() - _Gxmin) * _Rx;
+                    _Sx_type _Factor = _Rx;
+                    for (int _Idx = 2; _Idx < _Params._Kx; ++_Idx) {
+                        _Factor *= _Rx;
+                        _Sx += static_cast<_Sx_type>(_Gx() - _Gxmin) * _Factor;
+                    }
+                }
+            } while (_Sx >= _Limit);
 
-    _Real _Ans{0};
-    _Real _Factor{1};
+            return static_cast<_Real>(static_cast<_Result_uint_type>(_Sx / _Params._Xx))
+                 * static_cast<_Real>(_Params._Scale);
+        }
+    }
+}
 
-    for (int _Idx = 0; _Idx < _Kx; ++_Idx) { // add in another set of bits
-        _Ans += (static_cast<_Real>(_Gx()) - _Gxmin) * _Factor;
-        _Factor *= _Rx;
+template <class _Uint_type, class _Gen, class _Real>
+bool _Nrand_for_tr1(
+    const uint64_t _Rd, const uint64_t _Rx, _Gen& _Gx, _Real& _Val, uint64_t& _Sx_init, uint64_t& _Factor_init) {
+    // Minimally-constexpr generate_canonical algorithm. Will save work and exit if _Factor would overflow.
+    _Uint_type _Sx             = _Sx_init;
+    _Uint_type _Factor         = _Factor_init;
+    const auto _Gxmin          = (_Gx.min)();
+    const auto _Overflow_limit = UINT64_MAX / _Rx;
+
+    _Uint_type _Xx    = 0;
+    _Uint_type _Limit = 0;
+    bool _Init        = false;
+    for (;;) {
+        while (_Factor < _Rd) {
+            if constexpr (is_same_v<_Uint_type, uint64_t>) {
+                if (_Factor > _Overflow_limit) {
+                    _Sx_init     = _Sx;
+                    _Factor_init = _Factor;
+                    return true;
+                }
+            }
+
+            _Sx += (_Gx() - _Gxmin) * _Factor;
+            _Factor *= _Rx;
+        }
+
+        if (!_Init) {
+            _Init  = true;
+            _Xx    = _Factor / _Rd;
+            _Limit = _Xx * _Rd;
+        }
+
+        if (_Sx < _Limit) {
+            break;
+        } else {
+            _Sx     = 0;
+            _Factor = 1;
+        }
     }
 
-    return _Ans / _Factor;
+    _Val = static_cast<_Real>(static_cast<uint64_t>(_Sx / _Xx));
+    return false;
 }
 
 template <class _Gen, class = void>
 struct _Has_static_min_max : false_type {};
 
-// This checks a requirement of N4950 [rand.req.urng] `concept uniform_random_bit_generator` but doesn't attempt
+// This checks a requirement of N4981 [rand.req.urng] `concept uniform_random_bit_generator` but doesn't attempt
 // to implement the whole concept - we just need to distinguish Standard machinery from tr1 machinery.
 template <class _Gen>
 struct _Has_static_min_max<_Gen, void_t<decltype(bool_constant<(_Gen::min)() < (_Gen::max)()>::value)>> : true_type {};
@@ -323,18 +413,40 @@ template <class _Real, class _Gen>
 _NODISCARD _Real _Nrand_impl(_Gen& _Gx) { // build a floating-point value from random sequence
     _RNG_REQUIRE_REALTYPE(_Nrand_impl, _Real);
 
-    constexpr auto _Digits  = static_cast<size_t>(numeric_limits<_Real>::digits);
-    constexpr auto _Bits    = ~size_t{0};
-    constexpr auto _Minbits = _Digits < _Bits ? _Digits : _Bits;
+    constexpr auto _Digits = static_cast<size_t>(numeric_limits<_Real>::digits);
+
+    if constexpr (_Has_static_min_max<_Gen>::value) {
+        return _STD generate_canonical<_Real, _Digits>(_Gx);
+    } else if constexpr (is_integral_v<typename _Gen::result_type>) {
+        // TRANSITION, for integral tr1 machinery only; Standard machinery can call generate_canonical directly
+        const auto _Gxmax = (_Gx.max)();
+        const auto _Gxmin = (_Gx.min)();
+        _Real _Val{0};
+
+        if (_Gxmax == UINT64_MAX && _Gxmin == 0) {
+            // special case when uint64_t can't hold the full URBG range
+            _Val = static_cast<_Real>(static_cast<uint64_t>(_Gx()) >> (64 - _Digits));
+        } else {
+            constexpr auto _Rd = 1ULL << _Digits;
+            const auto _Rx     = static_cast<uint64_t>(_Gxmax - _Gxmin) + 1;
+            uint64_t _Sx       = 0;
+            uint64_t _Factor   = 1;
+
+            // Try with 64 bits first, upgrade to 128 if necessary.
+            const bool _Would_overflow = _Nrand_for_tr1<uint64_t>(_Rd, _Rx, _Gx, _Val, _Sx, _Factor);
+            if (_Would_overflow) {
+                _Nrand_for_tr1<_Unsigned128>(_Rd, _Rx, _Gx, _Val, _Sx, _Factor);
+            }
+        }
 
-    if constexpr (_Has_static_min_max<_Gen>::value && _Minbits <= 64) {
-        return _STD generate_canonical<_Real, _Minbits>(_Gx);
-    } else { // TRANSITION, for tr1 machinery only; Standard machinery can call generate_canonical directly
+        return _STD ldexp(_Val, -static_cast<int>(_Digits));
+    } else {
+        // TRANSITION, the pre-P0952R2 algorithm, for non-integral tr1 machinery only (e.g. subtract_with_carry_01)
         const _Real _Gxmin = static_cast<_Real>((_Gx.min)());
         const _Real _Gxmax = static_cast<_Real>((_Gx.max)());
         const _Real _Rx    = (_Gxmax - _Gxmin) + _Real{1};
 
-        const int _Ceil = static_cast<int>(_STD ceil(static_cast<_Real>(_Minbits) / _STD log2(_Rx)));
+        const int _Ceil = static_cast<int>(_STD ceil(static_cast<_Real>(_Digits) / _STD log2(_Rx)));
         const int _Kx   = _Ceil < 1 ? 1 : _Ceil;
 
         _Real _Ans{0};
@@ -1968,8 +2080,8 @@ public:
     explicit _Rng_from_urng_v2(_Urng& _Func) noexcept : _Ref(_Func) {}
 
     _Diff operator()(_Diff _Index) { // adapt _Urng closed range to [0, _Index)
-        // From Daniel Lemire, "Fast Random Integer Generation in an Interval", ACM Trans. Model. Comput. Simul. 29 (1),
-        // 2019.
+        // From Daniel Lemire, "Fast Random Integer Generation in an Interval",
+        // ACM Trans. Model. Comput. Simul. 29 (1), 2019.
         //
         // Algorithm 5 <-> This Code:
         //   m <-> _Product
@@ -2925,8 +3037,8 @@ public:
 
 private:
     template <class _Engine>
-    result_type _Eval(
-        _Engine& _Eng, const param_type& _Par0) const { // Press et al., Numerical Recipes in C, 2nd ed., pp 295-296.
+    result_type _Eval(_Engine& _Eng, const param_type& _Par0) const {
+        // Press et al., Numerical Recipes in C, 2nd ed., pp 295-296.
         _Ty _Res;
         if (_Par0._Tx < 25) { // generate directly
             _Res = 0;
diff --git a/stl/inc/yvals_core.h b/stl/inc/yvals_core.h
index 8a0e7d29d1..a840302fba 100644
--- a/stl/inc/yvals_core.h
+++ b/stl/inc/yvals_core.h
@@ -62,6 +62,7 @@
 // P0883R2 Fixing Atomic Initialization
 // P0935R0 Eradicating Unnecessarily Explicit Default Constructors
 // P0941R2 Feature-Test Macros
+// P0952R2 A New Specification For generate_canonical()
 // P0972R0 noexcept For <chrono> zero(), min(), max()
 // P1065R2 constexpr INVOKE
 //     (the std::invoke function only; other components like bind and reference_wrapper are C++20 only)
diff --git a/tests/libcxx/expected_results.txt b/tests/libcxx/expected_results.txt
index b3f0faf60b..120b14e2fa 100644
--- a/tests/libcxx/expected_results.txt
+++ b/tests/libcxx/expected_results.txt
@@ -67,6 +67,9 @@ std/containers/unord/unord.set/insert_and_emplace_allocator_requirements.pass.cp
 # Bogus test believes that copyability of array<T, 0> must be the same as array<T, 1>
 std/containers/sequences/array/array.cons/implicit_copy.pass.cpp FAIL
 
+# libc++ hasn't implemented P0952R2, which changes the generate_canonical algorithm.
+std/numerics/rand/rand.util/rand.util.canonical/generate_canonical.pass.cpp FAIL
+
 # Test expects __cpp_lib_chrono to have the old value 201611L for P0505R0; we define the C++20 value 201907L for P1466R3.
 std/language.support/support.limits/support.limits.general/chrono.version.compile.pass.cpp FAIL
 
diff --git a/tests/std/test.lst b/tests/std/test.lst
index 3b22ab147a..97c4083505 100644
--- a/tests/std/test.lst
+++ b/tests/std/test.lst
@@ -198,7 +198,6 @@ tests\GH_001850_clog_tied_to_cout
 tests\GH_001858_iostream_exception
 tests\GH_001912_random_distribution_operator_const
 tests\GH_001914_cached_position
-tests\GH_001964_constexpr_generate_canonical
 tests\GH_002030_asan_annotate_string
 tests\GH_002030_asan_annotate_vector
 tests\GH_002039_byte_is_not_trivially_swappable
@@ -511,6 +510,7 @@ tests\P0898R3_identity
 tests\P0912R5_coroutine
 tests\P0919R3_heterogeneous_unordered_lookup
 tests\P0943R6_stdatomic_h
+tests\P0952R2_new_generate_canonical
 tests\P0966R1_string_reserve_should_not_shrink
 tests\P0980R1_constexpr_strings
 tests\P1004R2_constexpr_vector
diff --git a/tests/std/tests/GH_001017_discrete_distribution_out_of_range/env.lst b/tests/std/tests/GH_001017_discrete_distribution_out_of_range/env.lst
index 19f025bd0e..8b6c540b4b 100644
--- a/tests/std/tests/GH_001017_discrete_distribution_out_of_range/env.lst
+++ b/tests/std/tests/GH_001017_discrete_distribution_out_of_range/env.lst
@@ -1,4 +1,5 @@
 # Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-RUNALL_INCLUDE ..\usual_matrix.lst
+# Skip /clr:pure configurations; they pass but take an extremely long time to run, risking test timeouts.
+RUNALL_INCLUDE ..\impure_matrix.lst
diff --git a/tests/std/tests/GH_001964_constexpr_generate_canonical/test.cpp b/tests/std/tests/GH_001964_constexpr_generate_canonical/test.cpp
deleted file mode 100644
index b8d40cb9f1..0000000000
--- a/tests/std/tests/GH_001964_constexpr_generate_canonical/test.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#include <cassert>
-#include <cmath>
-#include <cstdint>
-#include <random>
-#include <utility>
-
-using namespace std;
-
-#define STATIC_ASSERT(...) static_assert(__VA_ARGS__, #__VA_ARGS__)
-
-int naive_iterations(const int bits, const uint64_t gmin, const uint64_t gmax) {
-    // Naive implementation of [rand.util.canonical]. Note that for large values of range, it's possible that
-    // log2(range) == bits when range < 2^bits. This can lead to incorrect results, so we can't use this function as
-    // a reference for all values.
-
-    const double range = static_cast<double>(gmax) - static_cast<double>(gmin) + 1.0;
-    return max(1, static_cast<int>(ceil(static_cast<double>(bits) / log2(range))));
-}
-
-void test(const int target_bits) {
-    // Increase the range until the number of iterations repeats.
-    uint64_t range = 2;
-    int k          = 0;
-    int prev_k     = -1;
-    while (k != prev_k) {
-        prev_k       = exchange(k, naive_iterations(target_bits, 1, range));
-        const int k1 = _Generate_canonical_iterations(target_bits, 1, range);
-        assert(k == k1);
-        ++range;
-    }
-
-    // Now only check the crossover points, where incrementing the range actually causes the number of iterations to
-    // increase.
-    while (--k != 0) {
-        // The largest range such that k iterations generating [1,range] produces less than target_bits bits.
-        if (k == 1) {
-            range = ~uint64_t{0} >> (64 - target_bits);
-        } else {
-            range = static_cast<uint64_t>(ceil(pow(2.0, static_cast<double>(target_bits) / k))) - 1;
-        }
-
-        int k0 = (k == 1) ? 2 : naive_iterations(target_bits, 1, range);
-        int k1 = _Generate_canonical_iterations(target_bits, 1, range);
-        assert(k0 == k1);
-        assert(k1 == k + 1);
-
-        k0 = (k == 1) ? 1 : naive_iterations(target_bits, 0, range);
-        k1 = _Generate_canonical_iterations(target_bits, 0, range);
-        assert(k0 == k1);
-        assert(k1 == k);
-    }
-}
-
-int main() {
-    STATIC_ASSERT(_Generate_canonical_iterations(53, 1, uint64_t{1} << 32) == 2);
-    STATIC_ASSERT(_Generate_canonical_iterations(64, 0, ~uint64_t{0}) == 1);
-
-    for (int bits = 0; bits <= 64; ++bits) {
-        test(bits);
-    }
-}
diff --git a/tests/std/tests/GH_001964_constexpr_generate_canonical/env.lst b/tests/std/tests/P0952R2_new_generate_canonical/env.lst
similarity index 100%
rename from tests/std/tests/GH_001964_constexpr_generate_canonical/env.lst
rename to tests/std/tests/P0952R2_new_generate_canonical/env.lst
diff --git a/tests/std/tests/P0952R2_new_generate_canonical/test.cpp b/tests/std/tests/P0952R2_new_generate_canonical/test.cpp
new file mode 100644
index 0000000000..187d202a28
--- /dev/null
+++ b/tests/std/tests/P0952R2_new_generate_canonical/test.cpp
@@ -0,0 +1,206 @@
+// Copyright (c) Microsoft Corporation.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#define _SILENCE_TR1_RANDOM_DEPRECATION_WARNING
+
+#include <__msvc_int128.hpp>
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <random>
+
+using namespace std;
+
+void test_lwg2524() {
+    mt19937_64 mt2(1);
+    mt2.discard(517517);
+    assert((generate_canonical<float, 32>) (mt2) < 1.0f);
+    assert((generate_canonical<float, 32>) (mt2) < 1.0f);
+    assert((generate_canonical<float, 32>) (mt2) < 1.0f);
+}
+
+void test_tr1_16() {
+    using E_std = linear_congruential_engine<uint32_t, 75, 74, 65537>;
+    using E_tr1 = linear_congruential<uint32_t, 75, 74, 65537>;
+
+    E_std std_eng;
+    E_tr1 tr1_eng;
+
+    for (int i = 0; i < 10; ++i) {
+        assert(std_eng() == tr1_eng());
+    }
+
+    std_eng.seed();
+    tr1_eng.seed();
+
+    uniform_real_distribution<float> dist_32;
+    for (int i = 0; i < 10; ++i) {
+        assert(dist_32(std_eng) == dist_32(tr1_eng));
+    }
+
+    uniform_real_distribution<double> dist_64;
+    for (int i = 0; i < 10; ++i) {
+        assert(dist_64(std_eng) == dist_64(tr1_eng));
+    }
+}
+
+void test_tr1_32() {
+    using E_std = minstd_rand;
+    using E_tr1 = linear_congruential<unsigned int, 48271, 0, 2147483647>;
+
+    E_std std_eng;
+    E_tr1 tr1_eng;
+
+    for (int i = 0; i < 10; ++i) {
+        assert(std_eng() == tr1_eng());
+    }
+
+    std_eng.seed();
+    tr1_eng.seed();
+
+    uniform_real_distribution<float> dist_32;
+    for (int i = 0; i < 10; ++i) {
+        assert(dist_32(std_eng) == dist_32(tr1_eng));
+    }
+
+    uniform_real_distribution<double> dist_64;
+    for (int i = 0; i < 10; ++i) {
+        assert(dist_64(std_eng) == dist_64(tr1_eng));
+    }
+}
+
+void test_tr1_64() {
+    using E_std = mt19937_64;
+    using E_tr1 = mersenne_twister<unsigned long long, 64, 312, 156, 31, 0xb5026f5aa96619e9ULL, 29, 17,
+        0x71d67fffeda60000ULL, 37, 0xfff7eee000000000ULL, 43>;
+
+    E_std std_eng;
+    E_tr1 tr1_eng(E_tr1::default_seed, 0x5555555555555555ULL, 6364136223846793005ULL);
+
+    std_eng.seed();
+    tr1_eng.seed(E_tr1::default_seed, 6364136223846793005ULL);
+
+    for (int i = 0; i < 10; ++i) {
+        assert(std_eng() == tr1_eng());
+    }
+
+    std_eng.seed();
+    tr1_eng.seed(E_tr1::default_seed, 6364136223846793005ULL);
+
+    uniform_real_distribution<float> dist_32;
+    for (int i = 0; i < 10; ++i) {
+        assert(dist_32(std_eng) == dist_32(tr1_eng));
+    }
+
+    uniform_real_distribution<double> dist_64;
+    for (int i = 0; i < 10; ++i) {
+        assert(dist_64(std_eng) == dist_64(tr1_eng));
+    }
+}
+
+template <class Real, size_t Bits, class Engine>
+Real generate_with_ibe() {
+    independent_bits_engine<Engine, Bits, uint64_t> ibe;
+    return generate_canonical<Real, 64>(ibe);
+}
+
+int main() {
+    {
+        // edge case, Bits == 0
+        using Engine = ranlux24;
+        Engine eng;
+        assert((generate_canonical<double, 0, Engine>) (eng) == 0.0);
+        assert(eng() == Engine{}());
+    }
+
+    {
+        // float, URBG range is a power of two
+        using Engine                = ranlux24;
+        constexpr uint32_t values[] = {0xE57B2C, 0xF91555, 0xD9F2DE};
+        Engine eng;
+        for (const auto& value : values) {
+            assert(eng() == value);
+        }
+
+        const auto expected1 = ldexpf(0xE57B2C >> (1 * 24 - 24), -24);
+        const auto expected2 = ldexpf(0x91555'57B2C >> (2 * 20 - 24), -24);
+        const auto expected3 = ldexpf(0xDE'55'2C >> (3 * 8 - 24), -24);
+
+        assert((generate_with_ibe<float, 24, Engine>) () == expected1);
+        assert((generate_with_ibe<float, 20, Engine>) () == expected2); // needs a 64 bit accumulator for $S$
+        assert((generate_with_ibe<float, 8, Engine>) () == expected3);
+    }
+
+    {
+        // double, URBG range is a power of two
+        using Engine                = mt19937_64;
+        constexpr uint64_t values[] = {0xC96D191C'F6'F6AEA6, 0x401F7AC7'8B'C80F1C, 0xB5EE8CB6AB'E457F8};
+        Engine eng;
+        for (const auto& value : values) {
+            assert(eng() == value);
+        }
+
+        const auto expected1 = ldexp(0xC96D191C'F6F'6AEA6 >> (1 * 64 - 53), -53);
+        const auto expected2 = ldexp(0x8B'C80F1C'F6'F6AEA6 >> (2 * 32 - 53), -53);
+        const auto expected3 =
+            ldexp(static_cast<uint64_t>(_Unsigned128{0x57F8'C80F1C'F6AEA6, 0xE4} >> (3 * 24 - 53)), -53);
+
+
+        assert((generate_with_ibe<double, 64, Engine>) () == expected1);
+        assert((generate_with_ibe<double, 32, Engine>) () == expected2);
+        assert((generate_with_ibe<double, 24, Engine>) () == expected3); // needs a 128 bit accumulator for $S$
+    }
+
+    {
+        // $k \in \{1,2\}$, URBG range is NOT a power of two
+        using Engine                = minstd_rand;
+        constexpr uint32_t values[] = {48271 - 1, 182605794 - 1}; // minstd_rand::min() == 1
+        Engine eng;
+        for (const auto& value : values) {
+            assert(eng() - Engine::min() == value);
+        }
+
+        constexpr uint64_t range = Engine::max() - Engine::min() + 1;
+        constexpr auto x1        = range / (1 << 24);
+        constexpr auto x2        = (range * range) / (1ULL << 53);
+        const float expected1    = ldexpf(static_cast<float>(values[0] / x1), -24);
+        const double expected2   = ldexp((values[0] + range * values[1]) / x2, -53);
+
+        // $k$ == 1
+        eng.seed();
+        assert((generate_canonical<float, 32, Engine>) (eng) == expected1);
+
+        // $k$ == 2
+        eng.seed();
+        assert((generate_canonical<double, 64, Engine>) (eng) == expected2);
+    }
+
+    {
+        // $k = 4, URBG range is NOT a power of two, 128-bit accumulator needed
+
+        // ZX81 generator, R = 65537, https://oeis.org/A357907
+        using Engine                = linear_congruential_engine<uint32_t, 75, 74, 65537>;
+        constexpr uint32_t values[] = {149, 11249, 57305, 38044};
+        Engine eng;
+        for (const auto& value : values) {
+            assert(eng() - Engine::min() == value);
+        }
+
+        constexpr _Unsigned128 range = Engine::max() - Engine::min() + 1;
+        constexpr auto x             = static_cast<uint64_t>((range * range * range * range) / (1ULL << 53));
+        const auto expected          = ldexp(
+            static_cast<uint64_t>((values[0] + range * (values[1] + range * (values[2] + range * values[3]))) / x),
+            -53);
+
+        eng.seed();
+        assert((generate_canonical<double, 64, Engine>) (eng) == expected);
+    }
+
+    test_tr1_16();
+    test_tr1_32();
+    test_tr1_64();
+    test_lwg2524();
+
+    return 0;
+}