From 3729e3fdfa5df6072e1b2d0dcea755d7e977ad58 Mon Sep 17 00:00:00 2001 From: Joao Paulo Magalhaes Date: Sat, 20 Aug 2022 17:15:19 +0100 Subject: [PATCH] v0.1.10 --- CMakeLists.txt | 2 +- changelog/0.1.10.md | 106 ++++++++++++++++++++++++++ changelog/current.md | 106 -------------------------- tbump.toml | 2 +- test/test_install/CMakeLists.txt | 2 +- test/test_singleheader/CMakeLists.txt | 2 +- 6 files changed, 110 insertions(+), 110 deletions(-) create mode 100644 changelog/0.1.10.md diff --git a/CMakeLists.txt b/CMakeLists.txt index e9412ded..b2450e10 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ project(c4core LANGUAGES CXX) include(./compat.cmake) -c4_project(VERSION 0.1.9 +c4_project(VERSION 0.1.10 AUTHOR "Joao Paulo Magalhaes ") option(C4CORE_WITH_FASTFLOAT "use fastfloat to parse floats" ON) diff --git a/changelog/0.1.10.md b/changelog/0.1.10.md new file mode 100644 index 00000000..ce95aaee --- /dev/null +++ b/changelog/0.1.10.md @@ -0,0 +1,106 @@ +### Changes + +Improved the performance of `c4/charconv.hpp` functions ([PR#77](https://github.com/biojppm/c4core/pull/77)): + - Added `digits_dec/hex/oct/bin()`. + - Optimized `write_dec/hex/oct/bin()`: + - these functions now return immediately without entering the loop if the output buffer is smaller than the respective `digits_dec/hex/oct/bin()`. This enables both: + - writing every character in its final position without having to reverse the string at the end + - avoiding the need to check the buffer size on appending every character. + - `write_dec()` now writes two digits at once, thus halving the number of integer divisions. + - Added `write_dec/hex/oct/bin_unchecked()`, which receive precomputed `digits_dec/hex/oct/bin()`, thus speeding up the radix `itoa()/utoa()` overloads. 
+ - Added `xtoa()` radix+digits overloads: + - `size_t xtoa(substr s, T v, T radix)` + - `size_t xtoa(substr s, T v, T radix, size_t num_digits)` + - `read_dec/hex/oct/bin()`: these functions no longer allow an empty input buffer. + - Use intrinsic functions `__builtin_clz()` (gcc) / `_BitScanReverse()` (msvc) in `c4::msb()` and `__builtin_ctz()` (gcc) / `_BitScanForward()` (msvc) in `c4::lsb()` when they are available. `msb()` is used by `digits_hex()/digits_bin()`. + - Refactored the charconv tests to improve consistency and thoroughness. + - Improved the charconv benchmarks to ensure full consistency across benchmarks. + - Special thanks and kudos to @fargies for being attentive and pinpointing several issues throughout the PR! + - Finding the best approach involved [writing an R&D benchmark for the several algorithm components](https://github.com/biojppm/c4core/tree/master/bm/bm_xtoa.cpp). This benchmark is disabled by default, and can be enabled with the flag `C4CORE_BM_XTOA_RND`. + - With the changes from this PR, the [charconv benchmark](https://github.com/biojppm/c4core/tree/master/bm/bm_charconv.cpp) results show that on Linux/g++11.2, with integral types: + - `c4::to_chars()` can be expected to be roughly... + - ~40% to 2x faster than `std::to_chars()` + - ~10x-30x faster than `sprintf()` + - ~50x-100x faster than a naive `stringstream::operator<<()` followed by `stringstream::str()` + - `c4::from_chars()` can be expected to be roughly... 
+ - ~10%-30% faster than `std::from_chars()` + - ~10x faster than `scanf()` + - ~30x-50x faster than a naive `stringstream::str()` followed by `stringstream::operator>>()` + - Here are the results from the run: + | Write throughput | | Read throughput | | + |:-------------------------|--------:|:-------------------------|---------:| + | **write `uint8_t`** | **MB/s**| **read `uint8_t`** | **MB/s**| + | `c4::to_chars` | 526.86 | `c4::from_chars` | 163.06 | + | `std::to_chars` | 379.03 | `std::from_chars` | 154.85 | + | `std::sprintf` | 20.49 | `std::scanf` | 15.75 | + | `std::stringstream` | 3.82 | `std::stringstream` | 3.83 | + | **write `int8_t`** | **MB/s**| **read `int8_t`** | **MB/s**| + | `c4::to_chars` | 599.98 | `c4::from_chars` | 184.20 | + | `std::to_chars` | 246.32 | `std::from_chars` | 156.40 | + | `std::sprintf` | 19.15 | `std::scanf` | 16.44 | + | `std::stringstream` | 3.83 | `std::stringstream` | 3.89 | + | **write `uint16_t`** | **MB/s**| **read `uint16_t`** | **MB/s**| + | `c4::to_chars` | 486.40 | `c4::from_chars` | 349.48 | + | `std::to_chars` | 454.24 | `std::from_chars` | 319.13 | + | `std::sprintf` | 38.74 | `std::scanf` | 28.12 | + | `std::stringstream` | 7.08 | `std::stringstream`| 6.73 | + | **write `int16_t`** | **MB/s**| **read `int16_t`** | **MB/s**| + | `c4::to_chars` | 507.44 | `c4::from_chars` | 282.95 | + | `std::to_chars` | 297.49 | `std::from_chars` | 186.18 | + | `std::sprintf` | 39.03 | `std::scanf` | 28.45 | + | `std::stringstream` | 6.98 | `std::stringstream`| 6.49 | + | **write `uint32_t`** | **MB/s**| **read `uint32_t`** | **MB/s**| + | `c4::to_chars` | 730.12 | `c4::from_chars` | 463.95 | + | `std::to_chars` | 514.76 | `std::from_chars` | 329.42 | + | `std::sprintf` | 71.19 | `std::scanf` | 44.97 | + | `std::stringstream` | 14.05 | `std::stringstream`| 12.57 | + | **write `int32_t`** | **MB/s**| **read `int32_t`** | **MB/s**| + | `c4::to_chars` | 618.76 | `c4::from_chars` | 345.53 | + | `std::to_chars` | 394.72 | 
`std::from_chars` | 224.46 | + | `std::sprintf` | 71.14 | `std::scanf` | 43.49 | + | `std::stringstream` | 13.91 | `std::stringstream`| 12.03 | + | **write `uint64_t`** | **MB/s**| **read `uint64_t`** | **MB/s**| + | `c4::to_chars` | 1118.87 | `c4::from_chars` | 928.49 | + | `std::to_chars` | 886.58 | `std::from_chars` | 759.03 | + | `std::sprintf` | 140.96 | `std::scanf` | 91.60 | + | `std::stringstream` | 28.01 | `std::stringstream`| 25.00 | + | **write `int64_t`** | **MB/s**| **read `int64_t`** | **MB/s**| + | `c4::to_chars` | 1198.78 | `c4::from_chars` | 713.76 | + | `std::to_chars` | 882.17 | `std::from_chars` | 646.18 | + | `std::sprintf` | 138.79 | `std::scanf` | 90.07 | + | `std::stringstream` | 27.62 | `std::stringstream`| 25.12 | + +If you feel suspicious about these bold claims, you can browse through [c4core's CI benchmark results](https://github.com/biojppm/c4core/actions/workflows/benchmarks.yml) which will hopefully give these more substance. + + +### New features + +- Added `bool c4::overflows<T>(csubstr s)` for detecting whether a string overflows a given integral type. See [PR#78](https://github.com/biojppm/c4core/pull/78). + - Also, added `c4::fmt::overflow_checked()` (and the corresponding `from_chars()` overload) to enable a check for overflow before parsing from string: + ```c++ + c4::from_chars(str, &val); // no overflow check + c4::from_chars(str, c4::fmt::overflow_checked(val)); // enable overflow check + // as an example, the implementation looks like: + template<class T> + bool c4::from_chars(c4::csubstr str, c4::fmt::overflow_checked<T> oc) + { + if(overflows<T>(str)) + return false; + return c4::from_chars(str, oc.val); + } + ``` + +### Fixes + +- Fix missing endianness macro on windows arm/arm64 compilations [PR #76](https://github.com/biojppm/c4core/pull/76). +- Add missing `#define` for the include guard of the amalgamated header (see [rapidyaml#246](https://github.com/biojppm/rapidyaml/issues/246)). 
+- Fix CPU detection with ARMEL [PR #86](https://github.com/biojppm/c4core/pull/86). +- Fix GCC version detection [PR #87](https://github.com/biojppm/c4core/pull/87). +- Fix [cmake#8](https://github.com/biojppm/cmake/issues/8): `SOVERSION` missing from shared libraries. +- Update fastfloat to 3.5.1. + +### Thanks + +- @fargies +- @daichifukui +- @janisozaur diff --git a/changelog/current.md b/changelog/current.md index ce95aaee..e69de29b 100644 --- a/changelog/current.md +++ b/changelog/current.md @@ -1,106 +0,0 @@ -### Changes - -Improved the performance of `c4/charconv.hpp` functions ([PR#77](https://github.com/biojppm/c4core/pull/77)): - - Added `digits_dec/hex/oct/bin()`. - - Optimized `write_dec/hex/oct/bin()`: - - these functions now return immediately without entering the loop if the output buffer is smaller than respectively `digits_dec/hex/oct/bin()`. This enables both: - - writing every character in its final position without having to revert the string at the end - - the need to check the buffer size on appending every character. - - `write_dec()` now writes two digits at once, thus halving the number of integer divisions. - - Added `write_dec/hex/oct/bin_unchecked()`, which receive precomputed `digits_dec/hex/oct/bin()`, thus speeding up the radix `itoa()/utoa()` overloads. - - Added `xtoa()` radix+digits overloads: - - `size_t xtoa(substr s, T v, T radix)` - - `size_t xtoa(substr s, T v, T radix, size_t num_digits)` - - `read_dec/hex/oct/bin()`: these functions no longer allow an empty input buffer. - - Use intrinsic functions `__builtin_clz()` (gcc) / `_BitScanReverse()` (msvc) in `c4::msb()` and `__builtin_ctz()` (gcc) / `_BitScanForward()` (msvc) in `c4::lsb()` when they are available. `msb()` is used by `digits_hex()/digits_bin()`. - - Refactored the charconv tests to improve consistency and thoroughness. - - Improved the charconv benchmarks to ensure full consistency across benchmarks. 
- - Special thanks and kudos to @fargies for being attentive and pinpointing several issues throughout the PR! - - Finding the best approach involved [writing a R&D benchmark for the several algorithm components](https://github.com/biojppm/c4core/tree/master/bm/bm_xtoa.cpp). This benchmark is disabled by default, and can be enabled with the flag `C4CORE_BM_XTOA_RND`. - - With the changes from this PR, the [charconv benchmark](https://github.com/biojppm/c4core/tree/master/bm_charconv.cpp) results show that on Linux/g++11.2, with integral types: - - `c4::to_chars()` can be expected to be roughly... - - ~40% to 2x faster than `std::to_chars()` - - ~10x-30x faster than `sprintf()` - - ~50x-100x faster than a naive `stringstream::operator<<()` followed by `stringstream::str()` - - `c4::from_chars()` can be expected to be roughly... - - ~10%-30% faster than `std::from_chars()` - - ~10x faster than `scanf()` - - ~30x-50x faster than a naive `stringstream::str()` followed by `stringstream::operator>>()` - - Here are the results from the run: - | Write throughput | | Read throughput | | - |:-------------------------|--------:|:-------------------------|---------:| - | **write `uint8_t`** | **MB/s**| **read `uint8_t`** | **MB/s**| - | `c4::to_chars` | 526.86 | `c4::from_chars` | 163.06 | - | `std::to_chars` | 379.03 | `std::from_chars` | 154.85 | - | `std::sprintf` | 20.49 | `std::scanf` | 15.75 | - | `std::stringstream` | 3.82 | `std::stringstream` | 3.83 | - | **write `int8_t`** | **MB/s**| **read `int8_t`** | **MB/s**| - | `c4::to_chars` | 599.98 | `c4::from_chars` | 184.20 | - | `std::to_chars` | 246.32 | `std::from_chars` | 156.40 | - | `std::sprintf` | 19.15 | `std::scanf` | 16.44 | - | `std::stringstream` | 3.83 | `std::stringstream` | 3.89 | - | **write `uint16_t`** | **MB/s**| **read `uint16_t`** | **MB/s**| - | `c4::to_chars` | 486.40 | `c4::from_chars` | 349.48 | - | `std::to_chars` | 454.24 | `std::from_chars` | 319.13 | - | `std::sprintf` | 38.74 | `std::scanf` 
| 28.12 | - | `std::stringstream` | 7.08 | `std::stringstream`| 6.73 | - | **write `int16_t`** | **MB/s**| **read `int16_t`** | **MB/s**| - | `c4::to_chars` | 507.44 | `c4::from_chars` | 282.95 | - | `std::to_chars` | 297.49 | `std::from_chars` | 186.18 | - | `std::sprintf` | 39.03 | `std::scanf` | 28.45 | - | `std::stringstream` | 6.98 | `std::stringstream`| 6.49 | - | **write `uint32_t`** | **MB/s**| **read `uint32_t`** | **MB/s**| - | `c4::to_chars` | 730.12 | `c4::from_chars` | 463.95 | - | `std::to_chars` | 514.76 | `std::from_chars` | 329.42 | - | `std::sprintf` | 71.19 | `std::scanf` | 44.97 | - | `std::stringstream` | 14.05 | `std::stringstream`| 12.57 | - | **write `int32_t`** | **MB/s**| **read `int32_t`** | **MB/s**| - | `c4::to_chars` | 618.76 | `c4::from_chars` | 345.53 | - | `std::to_chars` | 394.72 | `std::from_chars` | 224.46 | - | `std::sprintf` | 71.14 | `std::scanf` | 43.49 | - | `std::stringstream` | 13.91 | `std::stringstream`| 12.03 | - | **write `uint64_t`** | **MB/s**| **read `uint64_t`** | **MB/s**| - | `c4::to_chars` | 1118.87 | `c4::from_chars` | 928.49 | - | `std::to_chars` | 886.58 | `std::from_chars` | 759.03 | - | `std::sprintf` | 140.96 | `std::scanf` | 91.60 | - | `std::stringstream` | 28.01 | `std::stringstream`| 25.00 | - | **write `int64_t`** | **MB/s**| **read `int64_t`** | **MB/s**| - | `c4::to_chars` | 1198.78 | `c4::from_chars` | 713.76 | - | `std::to_chars` | 882.17 | `std::from_chars` | 646.18 | - | `std::sprintf` | 138.79 | `std::scanf` | 90.07 | - | `std::stringstream` | 27.62 | `std::stringstream`| 25.12 | - -If you feel suspicious about these bold claims, you can browse through [c4core's CI benchmark results](https://github.com/biojppm/c4core/actions/workflows/benchmarks.yml) which will hopefully give these more substance. - - -### New features - -- Added `bool c4::overflows(csubstr s)` for detecting whether a string overflows a given integral type. See [PR#78](https://github.com/biojppm/c4core/pull/78). 
- - Also, added `c4::fmt::overflow_checked()` (and the corresponding `from_chars()` overload) to enable a check for overflow before parsing from string: - ```c++ - c4::from_chars(str, &val); // no overflow check - c4::from_chars(str, c4::fmt::overflow_checked(val)); // enable overflow check - // as an example, the implementation looks like: - template - bool c4::from_chars(c4::csubstr str, c4::fmt::overflow_checked oc) - { - if(overflows(str)) - return false; - return c4::from_chars(str, oc.val); - } - ``` - -### Fixes - -- Fix missing endianess macro on windows arm/arm64 compilations [PR #76](https://github.com/biojppm/c4core/pull/76) -- Add missing `#define` for the include guard of the amalgamated header (see [rapidyaml#246](https://github.com/biojppm/rapidyaml/issues/246)). -- Fix CPU detection with ARMEL [PR #86](https://github.com/biojppm/c4core/pull/86). -- Fix GCC version detection [PR #87](https://github.com/biojppm/c4core/pull/87). -- Fix [cmake#8](https://github.com/biojppm/cmake/issues/8): `SOVERSION` missing from shared libraries. -- Update fastfloat to 3.5.1. - -### Thanks - -- @fargies -- @daichifukui -- @janisozaur diff --git a/tbump.toml b/tbump.toml index b630e3b3..fdc06aef 100644 --- a/tbump.toml +++ b/tbump.toml @@ -2,7 +2,7 @@ # github_url = "https://github.com///" [version] -current = "0.1.9" +current = "0.1.10" # Example of a semver regexp. 
# Make sure this matches current_version before diff --git a/test/test_install/CMakeLists.txt b/test/test_install/CMakeLists.txt index ddc82fe4..01cc7a9c 100644 --- a/test/test_install/CMakeLists.txt +++ b/test/test_install/CMakeLists.txt @@ -4,7 +4,7 @@ project(c4core HOMEPAGE_URL "https://github.com/biojppm/c4core" LANGUAGES CXX) include(../../cmake/c4Project.cmake) -c4_project(VERSION 0.1.9 +c4_project(VERSION 0.1.10 AUTHOR "Joao Paulo Magalhaes ") if(C4CORE_TEST_INSTALL_PACKAGE_MODE) diff --git a/test/test_singleheader/CMakeLists.txt b/test/test_singleheader/CMakeLists.txt index ca598059..7b0d63d0 100644 --- a/test/test_singleheader/CMakeLists.txt +++ b/test/test_singleheader/CMakeLists.txt @@ -4,7 +4,7 @@ project(c4core HOMEPAGE_URL "https://github.com/biojppm/c4core" LANGUAGES CXX) include(../../cmake/c4Project.cmake) -c4_project(VERSION 0.1.9 +c4_project(VERSION 0.1.10 AUTHOR "Joao Paulo Magalhaes ") # amalgamate c4core to get the single header