diff --git a/stl/inc/random b/stl/inc/random index 2f4fc83cc2..9fc33d3e15 100644 --- a/stl/inc/random +++ b/stl/inc/random @@ -261,30 +261,37 @@ private: vector _Myvec; }; -_NODISCARD constexpr int _Generate_canonical_iterations(const int _Bits, const uint64_t _Gmin, const uint64_t _Gmax) { - // For a URBG type `G` with range == `(G::max() - G::min()) + 1`, returns the number of calls to generate at least - // _Bits bits of entropy. Specifically, max(1, ceil(_Bits / log2(range))). - - _STL_INTERNAL_CHECK(0 <= _Bits && _Bits <= 64); +struct _Urbg_gen_can_params { + bool _Rx_is_pow2 = false; // $R$ (in N4981 [rand.util.canonical]/1.2) is a power of 2 + int _Kx = 0; // $k$ in N4981 [rand.util.canonical]/1.4 + uint64_t _Xx = 0; // $x$ in N4981 [rand.util.canonical]/1.5 + float _Scale = 0.0f; // $r^{-d}$, always representable as a float + int _Smax_bits = 0; // number of bits in $R^k$ +}; - if (_Bits == 0 || (_Gmax == UINT64_MAX && _Gmin == 0)) { - return 1; +_NODISCARD constexpr _Urbg_gen_can_params _Generate_canonical_params(const size_t _Bits, const uint64_t _Rm1) { + const auto _Rx = _Unsigned128{_Rm1} + 1; // $R$ in N4981 [rand.util.canonical]/1.2 + const auto _Target = _Unsigned128{1} << _Bits; // $r^d$ + _Unsigned128 _Product = 1; + _Urbg_gen_can_params _Result; + _Result._Rx_is_pow2 = (_Rx & (_Rx - 1)) == 0; + _Result._Kx = 0; + while (_Product < _Target) { + _Product *= _Rx; + ++_Result._Kx; } - const auto _Range = (_Gmax - _Gmin) + 1; - const auto _Target = ~uint64_t{0} >> (64 - _Bits); - uint64_t _Prod = 1; - int _Ceil = 0; - while (_Prod <= _Target) { - ++_Ceil; - if (_Prod > UINT64_MAX / _Range) { - break; - } - - _Prod *= _Range; + _Result._Xx = static_cast(_Product / _Target); + _Result._Scale = 1.0f / static_cast(1ULL << _Bits); + _Result._Smax_bits = 0; + // $S \in [0, _Product)$ + --_Product; + while (_Product > 0) { + ++_Result._Smax_bits; + _Product >>= 1; } - return _Ceil; + return _Result; } _EXPORT_STD template @@ -294,27 +301,110 @@ _NODISCARD _Real generate_canonical(_Gen& _Gx) { // build a floating-point value constexpr auto _Digits = static_cast(numeric_limits<_Real>::digits); constexpr auto _Minbits = static_cast(_Digits < _Bits ? _Digits : _Bits); - constexpr auto _Gxmin = static_cast<_Real>((_Gen::min)()); - constexpr auto _Gxmax = static_cast<_Real>((_Gen::max)()); - constexpr auto _Rx = (_Gxmax - _Gxmin) + _Real{1}; + if constexpr (_Minbits == 0) { + return _Real{0}; + } else { + using _Result_uint_type = conditional_t<_Minbits <= 32, uint32_t, uint64_t>; + + constexpr auto _Gxmin = (_Gen::min)(); + constexpr auto _Gxmax = (_Gen::max)(); + constexpr auto _Params = _Generate_canonical_params(_Minbits, _Gxmax - _Gxmin); + + using _Sx_type = conditional_t<_Params._Smax_bits <= 32, uint32_t, + conditional_t<_Params._Smax_bits <= 64, uint64_t, _Unsigned128>>; + _Sx_type _Sx; + + if constexpr (_Params._Rx_is_pow2) { + // Always needs only one attempt. Multiplications can be replaced with shift/add. Optimize k=1 case. + if constexpr (_Params._Kx == 1) { + _Sx = static_cast<_Sx_type>(_Gx() - _Gxmin); + } else if constexpr (_Params._Kx > 1) { + constexpr int _Rx_bits = _Params._Smax_bits / _Params._Kx; + _Sx = 0; + int _Shift = 0; + for (int _Idx = 0; _Idx < _Params._Kx; ++_Idx) { + _Sx += static_cast<_Sx_type>(_Gx() - _Gxmin) << _Shift; + _Shift += _Rx_bits; + } + } else { + _STL_INTERNAL_STATIC_ASSERT(false); // unexpected k + } - constexpr int _Kx = _Generate_canonical_iterations(_Minbits, (_Gen::min)(), (_Gen::max)()); + _Sx >>= _Params._Smax_bits - _Minbits; + return static_cast<_Real>(static_cast<_Result_uint_type>(_Sx)) * static_cast<_Real>(_Params._Scale); + } else { + constexpr auto _Rx = _Sx_type{_Gxmax - _Gxmin} + 1u; + constexpr _Sx_type _Limit = static_cast<_Sx_type>(_Params._Xx) * (_Sx_type{1} << _Minbits); + + do { + // unroll first two iterations to avoid unnecessary multiplications + _Sx = static_cast<_Sx_type>(_Gx() - _Gxmin); + if constexpr (_Params._Kx == 2) { + _Sx += static_cast<_Sx_type>(_Gx() - _Gxmin) * _Rx; + } else if constexpr (_Params._Kx > 2) { + _Sx += static_cast<_Sx_type>(_Gx() - _Gxmin) * _Rx; + _Sx_type _Factor = _Rx; + for (int _Idx = 2; _Idx < _Params._Kx; ++_Idx) { + _Factor *= _Rx; + _Sx += static_cast<_Sx_type>(_Gx() - _Gxmin) * _Factor; + } + } + } while (_Sx >= _Limit); - _Real _Ans{0}; - _Real _Factor{1}; + return static_cast<_Real>(static_cast<_Result_uint_type>(_Sx / _Params._Xx)) + * static_cast<_Real>(_Params._Scale); + } + } +} - for (int _Idx = 0; _Idx < _Kx; ++_Idx) { // add in another set of bits - _Ans += (static_cast<_Real>(_Gx()) - _Gxmin) * _Factor; - _Factor *= _Rx; +template +bool _Nrand_for_tr1( + const uint64_t _Rd, const uint64_t _Rx, _Gen& _Gx, _Real& _Val, uint64_t& _Sx_init, uint64_t& _Factor_init) { + // Minimally-constexpr generate_canonical algorithm. Will save work and exit if _Factor would overflow. + _Uint_type _Sx = _Sx_init; + _Uint_type _Factor = _Factor_init; + const auto _Gxmin = (_Gx.min)(); + const auto _Overflow_limit = UINT64_MAX / _Rx; + + _Uint_type _Xx = 0; + _Uint_type _Limit = 0; + bool _Init = false; + for (;;) { + while (_Factor < _Rd) { + if constexpr (is_same_v<_Uint_type, uint64_t>) { + if (_Factor > _Overflow_limit) { + _Sx_init = _Sx; + _Factor_init = _Factor; + return true; + } + } + + _Sx += (_Gx() - _Gxmin) * _Factor; + _Factor *= _Rx; + } + + if (!_Init) { + _Init = true; + _Xx = _Factor / _Rd; + _Limit = _Xx * _Rd; + } + + if (_Sx < _Limit) { + break; + } else { + _Sx = 0; + _Factor = 1; + } } - return _Ans / _Factor; + _Val = static_cast<_Real>(static_cast(_Sx / _Xx)); + return false; } template struct _Has_static_min_max : false_type {}; -// This checks a requirement of N4950 [rand.req.urng] `concept uniform_random_bit_generator` but doesn't attempt +// This checks a requirement of N4981 [rand.req.urng] `concept uniform_random_bit_generator` but doesn't attempt // to implement the whole concept - we just need to distinguish Standard machinery from tr1 machinery. template struct _Has_static_min_max<_Gen, void_t::value)>> : true_type {}; @@ -323,18 +413,40 @@ template _NODISCARD _Real _Nrand_impl(_Gen& _Gx) { // build a floating-point value from random sequence _RNG_REQUIRE_REALTYPE(_Nrand_impl, _Real); - constexpr auto _Digits = static_cast(numeric_limits<_Real>::digits); - constexpr auto _Bits = ~size_t{0}; - constexpr auto _Minbits = _Digits < _Bits ? _Digits : _Bits; + constexpr auto _Digits = static_cast(numeric_limits<_Real>::digits); + + if constexpr (_Has_static_min_max<_Gen>::value) { + return _STD generate_canonical<_Real, _Digits>(_Gx); + } else if constexpr (is_integral_v) { + // TRANSITION, for integral tr1 machinery only; Standard machinery can call generate_canonical directly + const auto _Gxmax = (_Gx.max)(); + const auto _Gxmin = (_Gx.min)(); + _Real _Val{0}; + + if (_Gxmax == UINT64_MAX && _Gxmin == 0) { + // special case when uint64_t can't hold the full URBG range + _Val = static_cast<_Real>(static_cast(_Gx()) >> (64 - _Digits)); + } else { + constexpr auto _Rd = 1ULL << _Digits; + const auto _Rx = static_cast(_Gxmax - _Gxmin) + 1; + uint64_t _Sx = 0; + uint64_t _Factor = 1; + + // Try with 64 bits first, upgrade to 128 if necessary. + const bool _Would_overflow = _Nrand_for_tr1(_Rd, _Rx, _Gx, _Val, _Sx, _Factor); + if (_Would_overflow) { + _Nrand_for_tr1<_Unsigned128>(_Rd, _Rx, _Gx, _Val, _Sx, _Factor); + } + } - if constexpr (_Has_static_min_max<_Gen>::value && _Minbits <= 64) { - return _STD generate_canonical<_Real, _Minbits>(_Gx); - } else { // TRANSITION, for tr1 machinery only; Standard machinery can call generate_canonical directly + return _STD ldexp(_Val, -static_cast(_Digits)); + } else { + // TRANSITION, the pre-P0952R2 algorithm, for non-integral tr1 machinery only (e.g. subtract_with_carry_01) const _Real _Gxmin = static_cast<_Real>((_Gx.min)()); const _Real _Gxmax = static_cast<_Real>((_Gx.max)()); const _Real _Rx = (_Gxmax - _Gxmin) + _Real{1}; - const int _Ceil = static_cast(_STD ceil(static_cast<_Real>(_Minbits) / _STD log2(_Rx))); + const int _Ceil = static_cast(_STD ceil(static_cast<_Real>(_Digits) / _STD log2(_Rx))); const int _Kx = _Ceil < 1 ? 1 : _Ceil; _Real _Ans{0}; @@ -1968,8 +2080,8 @@ public: explicit _Rng_from_urng_v2(_Urng& _Func) noexcept : _Ref(_Func) {} _Diff operator()(_Diff _Index) { // adapt _Urng closed range to [0, _Index) - // From Daniel Lemire, "Fast Random Integer Generation in an Interval", ACM Trans. Model. Comput. Simul. 29 (1), - // 2019. + // From Daniel Lemire, "Fast Random Integer Generation in an Interval", + // ACM Trans. Model. Comput. Simul. 29 (1), 2019. // // Algorithm 5 <-> This Code: // m <-> _Product @@ -2925,8 +3037,8 @@ public: private: template - result_type _Eval( - _Engine& _Eng, const param_type& _Par0) const { // Press et al., Numerical Recipes in C, 2nd ed., pp 295-296. + result_type _Eval(_Engine& _Eng, const param_type& _Par0) const { + // Press et al., Numerical Recipes in C, 2nd ed., pp 295-296. _Ty _Res; if (_Par0._Tx < 25) { // generate directly _Res = 0; diff --git a/stl/inc/yvals_core.h b/stl/inc/yvals_core.h index 8a0e7d29d1..a840302fba 100644 --- a/stl/inc/yvals_core.h +++ b/stl/inc/yvals_core.h @@ -62,6 +62,7 @@ // P0883R2 Fixing Atomic Initialization // P0935R0 Eradicating Unnecessarily Explicit Default Constructors // P0941R2 Feature-Test Macros +// P0952R2 A New Specification For generate_canonical() // P0972R0 noexcept For zero(), min(), max() // P1065R2 constexpr INVOKE // (the std::invoke function only; other components like bind and reference_wrapper are C++20 only) diff --git a/tests/libcxx/expected_results.txt b/tests/libcxx/expected_results.txt index b3f0faf60b..120b14e2fa 100644 --- a/tests/libcxx/expected_results.txt +++ b/tests/libcxx/expected_results.txt @@ -67,6 +67,9 @@ std/containers/unord/unord.set/insert_and_emplace_allocator_requirements.pass.cp # Bogus test believes that copyability of array must be the same as array std/containers/sequences/array/array.cons/implicit_copy.pass.cpp FAIL +# libc++ hasn't implemented P0952R2, which changes the generate_canonical algorithm. +std/numerics/rand/rand.util/rand.util.canonical/generate_canonical.pass.cpp FAIL + # Test expects __cpp_lib_chrono to have the old value 201611L for P0505R0; we define the C++20 value 201907L for P1466R3. std/language.support/support.limits/support.limits.general/chrono.version.compile.pass.cpp FAIL diff --git a/tests/std/test.lst b/tests/std/test.lst index 3b22ab147a..97c4083505 100644 --- a/tests/std/test.lst +++ b/tests/std/test.lst @@ -198,7 +198,6 @@ tests\GH_001850_clog_tied_to_cout tests\GH_001858_iostream_exception tests\GH_001912_random_distribution_operator_const tests\GH_001914_cached_position -tests\GH_001964_constexpr_generate_canonical tests\GH_002030_asan_annotate_string tests\GH_002030_asan_annotate_vector tests\GH_002039_byte_is_not_trivially_swappable @@ -511,6 +510,7 @@ tests\P0898R3_identity tests\P0912R5_coroutine tests\P0919R3_heterogeneous_unordered_lookup tests\P0943R6_stdatomic_h +tests\P0952R2_new_generate_canonical tests\P0966R1_string_reserve_should_not_shrink tests\P0980R1_constexpr_strings tests\P1004R2_constexpr_vector diff --git a/tests/std/tests/GH_001017_discrete_distribution_out_of_range/env.lst b/tests/std/tests/GH_001017_discrete_distribution_out_of_range/env.lst index 19f025bd0e..8b6c540b4b 100644 --- a/tests/std/tests/GH_001017_discrete_distribution_out_of_range/env.lst +++ b/tests/std/tests/GH_001017_discrete_distribution_out_of_range/env.lst @@ -1,4 +1,5 @@ # Copyright (c) Microsoft Corporation. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -RUNALL_INCLUDE ..\usual_matrix.lst +# Skip /clr:pure configurations; they pass but take an extremely long time to run, risking test timeouts. +RUNALL_INCLUDE ..\impure_matrix.lst diff --git a/tests/std/tests/GH_001964_constexpr_generate_canonical/test.cpp b/tests/std/tests/GH_001964_constexpr_generate_canonical/test.cpp deleted file mode 100644 index b8d40cb9f1..0000000000 --- a/tests/std/tests/GH_001964_constexpr_generate_canonical/test.cpp +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include -#include -#include -#include -#include - -using namespace std; - -#define STATIC_ASSERT(...) static_assert(__VA_ARGS__, #__VA_ARGS__) - -int naive_iterations(const int bits, const uint64_t gmin, const uint64_t gmax) { - // Naive implementation of [rand.util.canonical]. Note that for large values of range, it's possible that - // log2(range) == bits when range < 2^bits. This can lead to incorrect results, so we can't use this function as - // a reference for all values. - - const double range = static_cast(gmax) - static_cast(gmin) + 1.0; - return max(1, static_cast(ceil(static_cast(bits) / log2(range)))); -} - -void test(const int target_bits) { - // Increase the range until the number of iterations repeats. - uint64_t range = 2; - int k = 0; - int prev_k = -1; - while (k != prev_k) { - prev_k = exchange(k, naive_iterations(target_bits, 1, range)); - const int k1 = _Generate_canonical_iterations(target_bits, 1, range); - assert(k == k1); - ++range; - } - - // Now only check the crossover points, where incrementing the range actually causes the number of iterations to - // increase. - while (--k != 0) { - // The largest range such that k iterations generating [1,range] produces less than target_bits bits. - if (k == 1) { - range = ~uint64_t{0} >> (64 - target_bits); - } else { - range = static_cast(ceil(pow(2.0, static_cast(target_bits) / k))) - 1; - } - - int k0 = (k == 1) ? 2 : naive_iterations(target_bits, 1, range); - int k1 = _Generate_canonical_iterations(target_bits, 1, range); - assert(k0 == k1); - assert(k1 == k + 1); - - k0 = (k == 1) ? 1 : naive_iterations(target_bits, 0, range); - k1 = _Generate_canonical_iterations(target_bits, 0, range); - assert(k0 == k1); - assert(k1 == k); - } -} - -int main() { - STATIC_ASSERT(_Generate_canonical_iterations(53, 1, uint64_t{1} << 32) == 2); - STATIC_ASSERT(_Generate_canonical_iterations(64, 0, ~uint64_t{0}) == 1); - - for (int bits = 0; bits <= 64; ++bits) { - test(bits); - } -} diff --git a/tests/std/tests/GH_001964_constexpr_generate_canonical/env.lst b/tests/std/tests/P0952R2_new_generate_canonical/env.lst similarity index 100% rename from tests/std/tests/GH_001964_constexpr_generate_canonical/env.lst rename to tests/std/tests/P0952R2_new_generate_canonical/env.lst diff --git a/tests/std/tests/P0952R2_new_generate_canonical/test.cpp b/tests/std/tests/P0952R2_new_generate_canonical/test.cpp new file mode 100644 index 0000000000..187d202a28 --- /dev/null +++ b/tests/std/tests/P0952R2_new_generate_canonical/test.cpp @@ -0,0 +1,206 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#define _SILENCE_TR1_RANDOM_DEPRECATION_WARNING + +#include <__msvc_int128.hpp> +#include +#include +#include +#include +#include + +using namespace std; + +void test_lwg2524() { + mt19937_64 mt2(1); + mt2.discard(517517); + assert((generate_canonical) (mt2) < 1.0f); + assert((generate_canonical) (mt2) < 1.0f); + assert((generate_canonical) (mt2) < 1.0f); +} + +void test_tr1_16() { + using E_std = linear_congruential_engine; + using E_tr1 = linear_congruential; + + E_std std_eng; + E_tr1 tr1_eng; + + for (int i = 0; i < 10; ++i) { + assert(std_eng() == tr1_eng()); + } + + std_eng.seed(); + tr1_eng.seed(); + + uniform_real_distribution dist_32; + for (int i = 0; i < 10; ++i) { + assert(dist_32(std_eng) == dist_32(tr1_eng)); + } + + uniform_real_distribution dist_64; + for (int i = 0; i < 10; ++i) { + assert(dist_64(std_eng) == dist_64(tr1_eng)); + } +} + +void test_tr1_32() { + using E_std = minstd_rand; + using E_tr1 = linear_congruential; + + E_std std_eng; + E_tr1 tr1_eng; + + for (int i = 0; i < 10; ++i) { + assert(std_eng() == tr1_eng()); + } + + std_eng.seed(); + tr1_eng.seed(); + + uniform_real_distribution dist_32; + for (int i = 0; i < 10; ++i) { + assert(dist_32(std_eng) == dist_32(tr1_eng)); + } + + uniform_real_distribution dist_64; + for (int i = 0; i < 10; ++i) { + assert(dist_64(std_eng) == dist_64(tr1_eng)); + } +} + +void test_tr1_64() { + using E_std = mt19937_64; + using E_tr1 = mersenne_twister; + + E_std std_eng; + E_tr1 tr1_eng(E_tr1::default_seed, 0x5555555555555555ULL, 6364136223846793005ULL); + + std_eng.seed(); + tr1_eng.seed(E_tr1::default_seed, 6364136223846793005ULL); + + for (int i = 0; i < 10; ++i) { + assert(std_eng() == tr1_eng()); + } + + std_eng.seed(); + tr1_eng.seed(E_tr1::default_seed, 6364136223846793005ULL); + + uniform_real_distribution dist_32; + for (int i = 0; i < 10; ++i) { + assert(dist_32(std_eng) == dist_32(tr1_eng)); + } + + uniform_real_distribution dist_64; + for (int i = 0; i < 10; ++i) { + assert(dist_64(std_eng) == dist_64(tr1_eng)); + } +} + +template +Real generate_with_ibe() { + independent_bits_engine ibe; + return generate_canonical(ibe); +} + +int main() { + { + // edge case, Bits == 0 + using Engine = ranlux24; + Engine eng; + assert((generate_canonical) (eng) == 0.0); + assert(eng() == Engine{}()); + } + + { + // float, URBG range is a power of two + using Engine = ranlux24; + constexpr uint32_t values[] = {0xE57B2C, 0xF91555, 0xD9F2DE}; + Engine eng; + for (const auto& value : values) { + assert(eng() == value); + } + + const auto expected1 = ldexpf(0xE57B2C >> (1 * 24 - 24), -24); + const auto expected2 = ldexpf(0x91555'57B2C >> (2 * 20 - 24), -24); + const auto expected3 = ldexpf(0xDE'55'2C >> (3 * 8 - 24), -24); + + assert((generate_with_ibe) () == expected1); + assert((generate_with_ibe) () == expected2); // needs a 64 bit accumulator for $S$ + assert((generate_with_ibe) () == expected3); + } + + { + // double, URBG range is a power of two + using Engine = mt19937_64; + constexpr uint64_t values[] = {0xC96D191C'F6'F6AEA6, 0x401F7AC7'8B'C80F1C, 0xB5EE8CB6AB'E457F8}; + Engine eng; + for (const auto& value : values) { + assert(eng() == value); + } + + const auto expected1 = ldexp(0xC96D191C'F6F'6AEA6 >> (1 * 64 - 53), -53); + const auto expected2 = ldexp(0x8B'C80F1C'F6'F6AEA6 >> (2 * 32 - 53), -53); + const auto expected3 = + ldexp(static_cast(_Unsigned128{0x57F8'C80F1C'F6AEA6, 0xE4} >> (3 * 24 - 53)), -53); + + + assert((generate_with_ibe) () == expected1); + assert((generate_with_ibe) () == expected2); + assert((generate_with_ibe) () == expected3); // needs a 128 bit accumulator for $S$ + } + + { + // $k \in \{1,2\}$, URBG range is NOT a power of two + using Engine = minstd_rand; + constexpr uint32_t values[] = {48271 - 1, 182605794 - 1}; // minstd_rand::min() == 1 + Engine eng; + for (const auto& value : values) { + assert(eng() - Engine::min() == value); + } + + constexpr uint64_t range = Engine::max() - Engine::min() + 1; + constexpr auto x1 = range / (1 << 24); + constexpr auto x2 = (range * range) / (1ULL << 53); + const float expected1 = ldexpf(static_cast(values[0] / x1), -24); + const double expected2 = ldexp((values[0] + range * values[1]) / x2, -53); + + // $k$ == 1 + eng.seed(); + assert((generate_canonical) (eng) == expected1); + + // $k$ == 2 + eng.seed(); + assert((generate_canonical) (eng) == expected2); + } + + { + // $k = 4, URBG range is NOT a power of two, 128-bit accumulator needed + + // ZX81 generator, R = 65537, https://oeis.org/A357907 + using Engine = linear_congruential_engine; + constexpr uint32_t values[] = {149, 11249, 57305, 38044}; + Engine eng; + for (const auto& value : values) { + assert(eng() - Engine::min() == value); + } + + constexpr _Unsigned128 range = Engine::max() - Engine::min() + 1; + constexpr auto x = static_cast((range * range * range * range) / (1ULL << 53)); + const auto expected = ldexp( + static_cast((values[0] + range * (values[1] + range * (values[2] + range * values[3]))) / x), + -53); + + eng.seed(); + assert((generate_canonical) (eng) == expected); + } + + test_tr1_16(); + test_tr1_32(); + test_tr1_64(); + test_lwg2524(); + + return 0; +}