diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3d63bc07..f0079b2f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,16 +1,31 @@ name: CI on: [push, pull_request] +env: + RUSTDOCFLAGS: -Dwarnings + RUSTFLAGS: -Dwarnings + RUST_LLVM_VERSION: 19.1-2024-09-17 + RUST_COMPILER_RT_ROOT: ./compiler-rt + jobs: test: name: Test runs-on: ${{ matrix.os }} strategy: + fail-fast: false matrix: include: + - target: aarch64-apple-darwin + os: macos-latest + rust: nightly - target: aarch64-unknown-linux-gnu os: ubuntu-latest rust: nightly + - target: aarch64-pc-windows-msvc + os: windows-latest + rust: nightly + test_verbatim: 1 + no_std: 1 - target: arm-unknown-linux-gnueabi os: ubuntu-latest rust: nightly @@ -23,25 +38,33 @@ jobs: - target: i686-unknown-linux-gnu os: ubuntu-latest rust: nightly - - target: mips-unknown-linux-gnu - os: ubuntu-latest - rust: nightly - - target: mips64-unknown-linux-gnuabi64 - os: ubuntu-latest - rust: nightly - - target: mips64el-unknown-linux-gnuabi64 + # MIPS targets disabled since they are dropped to tier 3. + # See https://github.com/rust-lang/compiler-team/issues/648 + #- target: mips-unknown-linux-gnu + # os: ubuntu-latest + # rust: nightly + #- target: mips64-unknown-linux-gnuabi64 + # os: ubuntu-latest + # rust: nightly + #- target: mips64el-unknown-linux-gnuabi64 + # os: ubuntu-latest + # rust: nightly + #- target: mipsel-unknown-linux-gnu + # os: ubuntu-latest + # rust: nightly + - target: powerpc-unknown-linux-gnu os: ubuntu-latest rust: nightly - - target: mipsel-unknown-linux-gnu + - target: powerpc64-unknown-linux-gnu os: ubuntu-latest rust: nightly - - target: powerpc-unknown-linux-gnu + - target: powerpc64le-unknown-linux-gnu os: ubuntu-latest rust: nightly - - target: powerpc64-unknown-linux-gnu + - target: riscv64gc-unknown-linux-gnu os: ubuntu-latest rust: nightly - - target: powerpc64le-unknown-linux-gnu + - target: sbf-solana-solana os: ubuntu-latest rust: nightly - target: thumbv6m-none-eabi @@ -63,14 +86,16 @@ jobs: os: ubuntu-latest rust: nightly - target: x86_64-apple-darwin - os: macos-latest + os: macos-13 rust: nightly - target: i686-pc-windows-msvc os: windows-latest rust: nightly + test_verbatim: 1 - target: x86_64-pc-windows-msvc os: windows-latest rust: nightly + test_verbatim: 1 - target: i686-pc-windows-gnu os: windows-latest rust: nightly-i686-gnu @@ -78,35 +103,71 @@ jobs: os: windows-latest rust: nightly-x86_64-gnu steps: - - uses: actions/checkout@v1 + - name: Print runner information + run: uname -a + - uses: actions/checkout@v4 with: submodules: true - name: Install Rust (rustup) run: rustup update ${{ matrix.rust }} --no-self-update && rustup default ${{ matrix.rust }} shell: bash - run: rustup target add ${{ matrix.target }} + if: matrix.target != 'sbf-solana-solana' - run: rustup component add llvm-tools-preview + - uses: Swatinem/rust-cache@v2 + with: + key: ${{ matrix.target }} + - name: Cache Docker layers + uses: actions/cache@v2 + if: matrix.os == 'ubuntu-latest' + with: + path: /tmp/.buildx-cache + key: ${{ matrix.target }}-buildx-${{ github.sha }} + restore-keys: ${{ matrix.target }}-buildx- + + - name: Cache compiler-rt + id: cache-compiler-rt + uses: actions/cache@v4 + with: + path: compiler-rt + key: ${{ runner.os }}-compiler-rt-${{ env.RUST_LLVM_VERSION }} - name: Download compiler-rt reference sources + if: steps.cache-compiler-rt.outputs.cache-hit != 'true' run: | - curl -L -o code.tar.gz 
https://github.com/rust-lang/llvm-project/archive/rustc/12.0-2021-04-15.tar.gz - tar xzf code.tar.gz --strip-components 1 llvm-project-rustc-12.0-2021-04-15/compiler-rt - echo RUST_COMPILER_RT_ROOT=./compiler-rt >> $GITHUB_ENV + curl -L -o code.tar.gz "https://github.com/rust-lang/llvm-project/archive/rustc/${RUST_LLVM_VERSION}.tar.gz" + tar xzf code.tar.gz --strip-components 1 llvm-project-rustc-${RUST_LLVM_VERSION}/compiler-rt shell: bash # Non-linux tests just use our raw script - run: ./ci/run.sh ${{ matrix.target }} if: matrix.os != 'ubuntu-latest' shell: bash + env: + NO_STD: ${{ matrix.no_std }} + TEST_VERBATIM: ${{ matrix.test_verbatim }} + + # Configure buildx to use Docker layer caching + - uses: docker/setup-buildx-action@v3 + if: matrix.os == 'ubuntu-latest' # Otherwise we use our docker containers to run builds - run: cargo generate-lockfile && ./ci/run-docker.sh ${{ matrix.target }} if: matrix.os == 'ubuntu-latest' + # Workaround to keep Docker cache smaller + # https://github.com/docker/build-push-action/issues/252 + # https://github.com/moby/buildkit/issues/1896 + - name: Move Docker cache + if: matrix.os == 'ubuntu-latest' + run: | + rm -rf /tmp/.buildx-cache + mv /tmp/.buildx-cache-new /tmp/.buildx-cache + rustfmt: name: Rustfmt runs-on: ubuntu-latest steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v4 with: submodules: true - name: Install stable `rustfmt` @@ -117,7 +178,7 @@ jobs: name: Clippy runs-on: ubuntu-latest steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v4 with: submodules: true # Unlike rustfmt, stable clippy does not work on code with nightly features. @@ -125,4 +186,20 @@ jobs: - name: Install nightly `clippy` run: | rustup set profile minimal && rustup default "nightly-$(curl -s https://rust-lang.github.io/rustup-components-history/x86_64-unknown-linux-gnu/clippy)" && rustup component add clippy + - uses: Swatinem/rust-cache@v2 - run: cargo clippy -- -D clippy::all + + success: + needs: + - test + - rustfmt + - clippy + runs-on: ubuntu-latest + # GitHub branch protection is exceedingly silly and treats "jobs skipped because a dependency + # failed" as success. So we have to do some contortions to ensure the job fails if any of its + # dependencies fails. + if: always() # make sure this is never "skipped" + steps: + # Manually check the status of all dependencies. `if: failure()` does not work. 
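+      # (The `needs` context serializes to a map of job id -> { result, ... };
+      # `jq --exit-status` exits nonzero unless every result is "success".)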
+      - name: check if any dependency failed
+        run: jq --exit-status 'all(.result == "success")' <<< '${{ toJson(needs) }}'
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
new file mode 100644
index 00000000..d568f375
--- /dev/null
+++ b/.github/workflows/publish.yml
@@ -0,0 +1,30 @@
+name: Release-plz
+
+permissions:
+  pull-requests: write
+  contents: write
+
+on:
+  push:
+    branches:
+      - master
+
+jobs:
+  release-plz:
+    name: Release-plz
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          submodules: true
+      - name: Install Rust (rustup)
+        run: rustup update nightly --no-self-update && rustup default nightly
+      - name: Publish `libm` as part of builtins, rather than its own crate
+        run: rm libm/Cargo.toml
+      - name: Run release-plz
+        uses: MarcoIeni/release-plz-action@v0.5
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
diff --git a/.gitignore b/.gitignore
index b203ea61..97df30ff 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
*.rs.bk
Cargo.lock
target
+compiler-rt
+*.tar.gz
diff --git a/.release-plz.toml b/.release-plz.toml
new file mode 100644
index 00000000..fce19d15
--- /dev/null
+++ b/.release-plz.toml
@@ -0,0 +1,8 @@
+[workspace]
+changelog_update = false
+semver_check = false
+
+# As part of the release process, we delete `libm/Cargo.toml`. Since
+# this is only run in CI, we shouldn't need to worry about it.
+allow_dirty = true
+publish_allow_dirty = true
diff --git a/Cargo.toml b/Cargo.toml
index 52c1ee39..508324e1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,12 +1,13 @@
[package]
authors = ["Jorge Aparicio "]
name = "compiler_builtins"
-version = "0.1.95"
+version = "0.1.133"
license = "MIT/Apache-2.0"
readme = "README.md"
repository = "https://github.com/rust-lang/compiler-builtins"
homepage = "https://github.com/rust-lang/compiler-builtins"
documentation = "https://docs.rs/compiler_builtins"
+edition = "2021"
description = """
Compiler intrinsics used by the Rust compiler. Also available for other targets
if necessary!
@@ -14,6 +15,7 @@ if necessary!
include = [
  '/Cargo.toml',
  '/build.rs',
+  '/configure.rs',
  '/src/*',
  '/examples/*',
  '/LICENSE.txt',
@@ -27,8 +29,8 @@
links = 'compiler-rt'
test = false

[dependencies]
-# For more information on this dependency see rust-lang/rust's
-# `src/tools/rustc-std-workspace` folder
+# For more information on this dependency see
+# https://github.com/rust-lang/rust/tree/master/library/rustc-std-workspace-core
core = { version = "1.0.0", optional = true, package = 'rustc-std-workspace-core' }

[build-dependencies]
@@ -48,6 +50,10 @@ c = ["cc"]
# which use inline assembly and fall back to pure Rust versions (if available).
no-asm = []

+# Workaround for codegen backends which haven't yet implemented `f16` and
+# `f128` support. Disables any intrinsics which use those types.
+no-f16-f128 = []
+
# Flag this library as the unstable compiler-builtins lib
compiler-builtins = []

@@ -65,17 +71,6 @@ rustc-dep-of-std = ['compiler-builtins', 'core']
# are not normally public but are required by the `testcrate`
public-test-deps = []

-# Marks all intrinsics functions with weak linkage so that they can be
-# replaced at link time by another implementation. This is particularly useful
-# for mixed Rust/C++ binaries that want to use the C++ intrinsics, otherwise
-# linking against the Rust stdlib will replace those from the compiler-rt
-# library.
-#
-# Unlike the "c" feature, the intrinsics are still provided by the Rust
-# implementations and each will be used unless a stronger symbol replaces
-# it during linking.
-weak-intrinsics = []
-
[[example]]
name = "intrinsics"
required-features = ["compiler-builtins"]
@@ -88,3 +83,6 @@ panic = 'abort'

[profile.dev]
panic = 'abort'
+
+[lints.rust]
+unexpected_cfgs = { level = "warn", check-cfg = ['cfg(target_family, values("solana"))', 'cfg(target_feature, values("static-syscalls"))', 'cfg(target_os, values("solana"))'] }
diff --git a/README.md b/README.md
index da0adbce..f792d188 100644
--- a/README.md
+++ b/README.md
@@ -16,13 +16,13 @@
you can use this crate to get those intrinsics and solve the linker errors. To
do that, add this crate somewhere in the dependency graph of the crate you are
building:

-``` toml
+```toml
# Cargo.toml
[dependencies]
compiler_builtins = { git = "https://github.com/rust-lang/compiler-builtins" }
```

-``` rust
+```rust
extern crate compiler_builtins;

// ...
@@ -52,17 +52,17 @@ features = ["c"]
2. Fork this repository.
3. Port the intrinsic(s) and their corresponding [unit tests][1] from their
   [C implementation][2] to Rust.
-4. Implement a [test generator][3] to compare the behavior of the ported intrinsic(s)
-   with their implementation on the testing host. Note that randomized compiler-builtin tests
-   should be run using `cargo test --features gen-tests`.
-4. Send a Pull Request (PR).
-5. Once the PR passes our extensive [testing infrastructure][4], we'll merge it!
-6. Celebrate :tada:
+4. Add a test to compare the behavior of the ported intrinsic(s) with their
+   implementation on the testing host.
+5. Add the intrinsic to `examples/intrinsics.rs` to verify it can be linked on
+   all targets.
+6. Send a Pull Request (PR).
+7. Once the PR passes our extensive testing infrastructure, we'll merge it!
+8. Celebrate :tada:

[1]: https://github.com/rust-lang/llvm-project/tree/9e3de9490ff580cd484fbfa2908292b4838d56e7/compiler-rt/test/builtins/Unit
[2]: https://github.com/rust-lang/llvm-project/tree/9e3de9490ff580cd484fbfa2908292b4838d56e7/compiler-rt/lib/builtins
-[3]: https://github.com/rust-lang/compiler-builtins/blob/0ba07e49264a54cb5bbd4856fcea083bb3fbec15/build.rs#L180-L265
-[4]: https://travis-ci.org/rust-lang/compiler-builtins
+[3]: https://github.com/rust-lang/compiler-builtins/actions

### Porting Reminders

@@ -78,12 +78,32 @@ features = ["c"]
[8]: http://en.cppreference.com/w/cpp/language/implicit_conversion
[9]: https://doc.rust-lang.org/std/primitive.i32.html

+## Testing
+
+The easiest way to test locally is using Docker. This can be done by running
+`./ci/run-docker.sh [target]`. If no target is specified, all targets will be
+run.
+
+In order to run the full test suite, you will also need the C compiler runtime
+to test against, located in a directory called `compiler-rt`. This can be
+obtained with the following:
+
+```sh
+curl -L -o rustc-llvm-19.1.tar.gz https://github.com/rust-lang/llvm-project/archive/rustc/19.1-2024-09-17.tar.gz
+tar xzf rustc-llvm-19.1.tar.gz --strip-components 1 llvm-project-rustc-19.1-2024-09-17/compiler-rt
+```
+
+Local targets may also be tested with `./ci/run.sh [target]`.
+
+Note that testing may not work on all hosts, in which case it is acceptable to
+rely on CI.
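+
+For example (the target names here are only examples; use any target you have
+installed):
+
+```sh
+# Test one target end-to-end inside its Docker image:
+./ci/run-docker.sh aarch64-unknown-linux-gnu
+
+# Or test a target natively, pointing at the compiler-rt sources downloaded above:
+RUST_COMPILER_RT_ROOT=./compiler-rt ./ci/run.sh x86_64-unknown-linux-gnu
+```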
+ ## Progress +- [x] aarch64/chkstk.S - [x] adddf3.c - [x] addsf3.c -- [x] arm/adddf3vfp.S -- [x] arm/addsf3vfp.S +- [x] arm/addsf3.S - [x] arm/aeabi_dcmp.S - [x] arm/aeabi_fcmp.S - [x] arm/aeabi_idivmod.S @@ -93,56 +113,40 @@ features = ["c"] - [x] arm/aeabi_memset.S - [x] arm/aeabi_uidivmod.S - [x] arm/aeabi_uldivmod.S -- [x] arm/divdf3vfp.S +- [ ] arm/chkstk.S - [ ] arm/divmodsi4.S (generic version is done) -- [x] arm/divsf3vfp.S - [ ] arm/divsi3.S (generic version is done) -- [x] arm/eqdf2vfp.S -- [x] arm/eqsf2vfp.S -- [x] arm/extendsfdf2vfp.S -- [ ] arm/fixdfsivfp.S -- [ ] arm/fixsfsivfp.S -- [ ] arm/fixunsdfsivfp.S -- [ ] arm/fixunssfsivfp.S -- [ ] arm/floatsidfvfp.S -- [ ] arm/floatsisfvfp.S -- [ ] arm/floatunssidfvfp.S -- [ ] arm/floatunssisfvfp.S -- [x] arm/gedf2vfp.S -- [x] arm/gesf2vfp.S -- [x] arm/gtdf2vfp.S -- [x] arm/gtsf2vfp.S -- [x] arm/ledf2vfp.S -- [x] arm/lesf2vfp.S -- [x] arm/ltdf2vfp.S -- [x] arm/ltsf2vfp.S - [ ] arm/modsi3.S (generic version is done) -- [x] arm/muldf3vfp.S -- [x] arm/mulsf3vfp.S -- [x] arm/nedf2vfp.S -- [ ] arm/negdf2vfp.S -- [ ] arm/negsf2vfp.S -- [x] arm/nesf2vfp.S - [x] arm/softfloat-alias.list -- [x] arm/subdf3vfp.S -- [x] arm/subsf3vfp.S -- [x] arm/truncdfsf2vfp.S - [ ] arm/udivmodsi4.S (generic version is done) - [ ] arm/udivsi3.S (generic version is done) - [ ] arm/umodsi3.S (generic version is done) -- [ ] arm/unorddf2vfp.S -- [ ] arm/unordsf2vfp.S - [x] ashldi3.c - [x] ashrdi3.c +- [ ] avr/divmodhi4.S +- [ ] avr/divmodqi4.S +- [ ] avr/mulhi3.S +- [ ] avr/mulqi3.S +- [ ] avr/udivmodhi4.S +- [ ] avr/udivmodqi4.S +- [x] bswapdi2.c +- [x] bswapsi2.c +- [x] bswapti2.c +- [x] clzdi2.c +- [x] clzsi2.c +- [x] clzti2.c - [x] comparedf2.c - [x] comparesf2.c +- [x] ctzdi2.c +- [x] ctzsi2.c +- [x] ctzti2.c - [x] divdf3.c - [x] divdi3.c - [x] divmoddi4.c - [x] divmodsi4.c +- [x] divmodti4.c - [x] divsf3.c - [x] divsi3.c -- [ ] extendhfsf2.c - [x] extendsfdf2.c - [x] fixdfdi.c - [x] fixdfsi.c @@ -163,7 +167,6 @@ features = ["c"] - [ ] i386/ashldi3.S - [ ] i386/ashrdi3.S - [x] i386/chkstk.S -- [x] i386/chkstk2.S - [ ] i386/divdi3.S - [ ] i386/lshrdi3.S - [ ] i386/moddi3.S @@ -180,11 +183,11 @@ features = ["c"] - [x] mulsf3.c - [x] powidf2.c - [x] powisf2.c +- [ ] riscv/muldi3.S +- [ ] riscv/mulsi3.S - [x] subdf3.c - [x] subsf3.c -- [ ] truncdfhf2.c - [x] truncdfsf2.c -- [ ] truncsfhf2.c - [x] udivdi3.c - [x] udivmoddi4.c - [x] udivmodsi4.c @@ -192,9 +195,8 @@ features = ["c"] - [x] umoddi3.c - [x] umodsi3.c - [x] x86_64/chkstk.S -- [x] x86_64/chkstk2.S -These builtins are needed to support 128-bit integers, which are in the process of being added to Rust. +These builtins are needed to support 128-bit integers. - [x] ashlti3.c - [x] ashrti3.c @@ -215,62 +217,123 @@ These builtins are needed to support 128-bit integers, which are in the process - [x] udivti3.c - [x] umodti3.c +These builtins are needed to support `f16` and `f128`, which are in the process +of being added to Rust. 
+
+- [x] addtf3.c
+- [x] comparetf2.c
+- [x] divtf3.c
+- [x] extenddftf2.c
+- [x] extendhfsf2.c
+- [x] extendhftf2.c
+- [x] extendsftf2.c
+- [x] fixtfdi.c
+- [x] fixtfsi.c
+- [x] fixtfti.c
+- [x] fixunstfdi.c
+- [x] fixunstfsi.c
+- [x] fixunstfti.c
+- [ ] floatditf.c
+- [ ] floatsitf.c
+- [ ] floattitf.c
+- [ ] floatunditf.c
+- [ ] floatunsitf.c
+- [ ] floatuntitf.c
+- [x] multf3.c
+- [x] powitf2.c
+- [x] subtf3.c
+- [x] truncdfhf2.c
+- [x] truncsfhf2.c
+- [x] trunctfdf2.c
+- [x] trunctfhf2.c
+- [x] trunctfsf2.c
+
+These builtins are used by the Hexagon DSP.
+
+- [ ] hexagon/common_entry_exit_abi1.S
+- [ ] hexagon/common_entry_exit_abi2.S
+- [ ] hexagon/common_entry_exit_legacy.S
+- [x] hexagon/dfaddsub.S
+- [x] hexagon/dfdiv.S
+- [x] hexagon/dffma.S
+- [x] hexagon/dfminmax.S
+- [x] hexagon/dfmul.S
+- [x] hexagon/dfsqrt.S
+- [x] hexagon/divdi3.S
+- [x] hexagon/divsi3.S
+- [x] hexagon/fastmath2_dlib_asm.S
+- [x] hexagon/fastmath2_ldlib_asm.S
+- [x] hexagon/fastmath_dlib_asm.S
+- [x] hexagon/memcpy_forward_vp4cp4n2.S
+- [x] hexagon/memcpy_likely_aligned.S
+- [x] hexagon/moddi3.S
+- [x] hexagon/modsi3.S
+- [x] hexagon/sfdiv_opt.S
+- [x] hexagon/sfsqrt_opt.S
+- [x] hexagon/udivdi3.S
+- [x] hexagon/udivmoddi4.S
+- [x] hexagon/udivmodsi4.S
+- [x] hexagon/udivsi3.S
+- [x] hexagon/umoddi3.S
+- [x] hexagon/umodsi3.S
+
## Unimplemented functions

-These builtins involve floating-point types ("`f128`", "`f80`" and complex numbers) that are not supported by Rust.
+These builtins are for x87 `f80` floating-point numbers that are not supported
+by Rust.

-- ~~addtf3.c~~
-- ~~comparetf2.c~~
-- ~~divdc3.c~~
-- ~~divsc3.c~~
-- ~~divtc3.c~~
-- ~~divtf3.c~~
-- ~~divxc3.c~~
-- ~~extenddftf2.c~~
-- ~~extendsftf2.c~~
-- ~~fixtfdi.c~~
-- ~~fixtfsi.c~~
-- ~~fixtfti.c~~
-- ~~fixunstfdi.c~~
-- ~~fixunstfsi.c~~
-- ~~fixunstfti.c~~
+- ~~extendxftf2.c~~
- ~~fixunsxfdi.c~~
- ~~fixunsxfsi.c~~
- ~~fixunsxfti.c~~
- ~~fixxfdi.c~~
- ~~fixxfti.c~~
-- ~~floatditf.c~~
- ~~floatdixf.c~~
-- ~~floatsitf.c~~
- ~~floattixf.c~~
-- ~~floatunditf.c~~
- ~~floatundixf.c~~
-- ~~floatunsitf.c~~
- ~~floatuntixf.c~~
- ~~i386/floatdixf.S~~
- ~~i386/floatundixf.S~~
-- ~~muldc3.c~~
-- ~~mulsc3.c~~
-- ~~multc3.c~~
-- ~~multf3.c~~
-- ~~mulxc3.c~~
-- ~~powitf2.c~~
-- ~~powixf2.c~~
+- ~~powixf2.c~~
+- ~~trunctfxf2.c~~
+- ~~x86_64/floatdixf.c~~
+- ~~x86_64/floatundixf.S~~
+
+These builtins are for IBM "extended double" non-IEEE 128-bit floating-point
+numbers.
+
- ~~ppc/divtc3.c~~
- ~~ppc/fixtfdi.c~~
+- ~~ppc/fixtfti.c~~
- ~~ppc/fixunstfdi.c~~
+- ~~ppc/fixunstfti.c~~
- ~~ppc/floatditf.c~~
+- ~~ppc/floattitf.c~~
- ~~ppc/floatunditf.c~~
- ~~ppc/gcc_qadd.c~~
- ~~ppc/gcc_qdiv.c~~
- ~~ppc/gcc_qmul.c~~
- ~~ppc/gcc_qsub.c~~
- ~~ppc/multc3.c~~
-- ~~subtf3.c~~
-- ~~trunctfdf2.c~~
-- ~~trunctfsf2.c~~
-- ~~x86_64/floatdixf.c~~
-- ~~x86_64/floatundixf.S~~
+
+These builtins are for 16-bit brain floating-point numbers that are not
+supported by Rust.
+
+- ~~truncdfbf2.c~~
+- ~~truncsfbf2.c~~
+
+These builtins involve complex floating-point types that are not supported by
+Rust.
+
+- ~~divdc3.c~~
+- ~~divsc3.c~~
+- ~~divtc3.c~~
+- ~~divxc3.c~~
+- ~~muldc3.c~~
+- ~~mulsc3.c~~
+- ~~multc3.c~~
+- ~~mulxc3.c~~

These builtins are never called by LLVM.

@@ -299,14 +362,9 @@
- ~~arm/switch32.S~~ - ~~arm/switch8.S~~ - ~~arm/switchu8.S~~ -- ~~clzdi2.c~~ -- ~~clzsi2.c~~ -- ~~clzti2.c~~ - ~~cmpdi2.c~~ - ~~cmpti2.c~~ -- ~~ctzdi2.c~~ -- ~~ctzsi2.c~~ -- ~~ctzti2.c~~ +- ~~ffssi2.c~~ - ~~ffsdi2.c~~ - this is [called by gcc][jemalloc-fail] though! - ~~ffsti2.c~~ - ~~mulvdi3.c~~ @@ -369,13 +427,34 @@ Rust only exposes atomic types on platforms that support them, and therefore doe Miscellaneous functionality that is not used by Rust. +- ~~aarch64/fp_mode.c~~ +- ~~aarch64/lse.S~~ (LSE atomics) +- ~~aarch64/sme-abi-init.c~~ (matrix extension) +- ~~aarch64/sme-abi.S~~ (matrix extension) +- ~~aarch64/sme-libc-routines.c~~ (matrix extension) - ~~apple_versioning.c~~ +- ~~arm/fp_mode.c~~ +- ~~avr/exit.S~~ - ~~clear_cache.c~~ +- ~~cpu_model/aarch64.c~~ +- ~~cpu_model/x86.c~~ +- ~~crtbegin.c~~ +- ~~crtend.c~~ - ~~emutls.c~~ - ~~enable_execute_stack.c~~ - ~~eprintf.c~~ +- ~~fp_mode.c~~ (float exception handling) - ~~gcc_personality_v0.c~~ +- ~~i386/fp_mode.c~~ +- ~~int_util.c~~ +- ~~loongarch/fp_mode.c~~ +- ~~os_version_check.c~~ +- ~~riscv/fp_mode.c~~ +- ~~riscv/restore.S~~ (callee-saved registers) +- ~~riscv/save.S~~ (callee-saved registers) - ~~trampoline_setup.c~~ +- ~~ve/grow_stack.S~~ +- ~~ve/grow_stack_align.S~~ Floating-point implementations of builtins that are only called from soft-float code. It would be better to simply use the generic soft-float versions in this case. @@ -388,6 +467,43 @@ Floating-point implementations of builtins that are only called from soft-float - ~~x86_64/floatdidf.c~~ - ~~x86_64/floatdisf.c~~ +Unsupported in any current target: used on old versions of 32-bit iOS with ARMv5. + +- ~~arm/adddf3vfp.S~~ +- ~~arm/addsf3vfp.S~~ +- ~~arm/divdf3vfp.S~~ +- ~~arm/divsf3vfp.S~~ +- ~~arm/eqdf2vfp.S~~ +- ~~arm/eqsf2vfp.S~~ +- ~~arm/extendsfdf2vfp.S~~ +- ~~arm/fixdfsivfp.S~~ +- ~~arm/fixsfsivfp.S~~ +- ~~arm/fixunsdfsivfp.S~~ +- ~~arm/fixunssfsivfp.S~~ +- ~~arm/floatsidfvfp.S~~ +- ~~arm/floatsisfvfp.S~~ +- ~~arm/floatunssidfvfp.S~~ +- ~~arm/floatunssisfvfp.S~~ +- ~~arm/gedf2vfp.S~~ +- ~~arm/gesf2vfp.S~~ +- ~~arm/gtdf2vfp.S~~ +- ~~arm/gtsf2vfp.S~~ +- ~~arm/ledf2vfp.S~~ +- ~~arm/lesf2vfp.S~~ +- ~~arm/ltdf2vfp.S~~ +- ~~arm/ltsf2vfp.S~~ +- ~~arm/muldf3vfp.S~~ +- ~~arm/mulsf3vfp.S~~ +- ~~arm/nedf2vfp.S~~ +- ~~arm/negdf2vfp.S~~ +- ~~arm/negsf2vfp.S~~ +- ~~arm/nesf2vfp.S~~ +- ~~arm/subdf3vfp.S~~ +- ~~arm/subsf3vfp.S~~ +- ~~arm/truncdfsf2vfp.S~~ +- ~~arm/unorddf2vfp.S~~ +- ~~arm/unordsf2vfp.S~~ + ## License The compiler-builtins crate is dual licensed under both the University of diff --git a/build.rs b/build.rs index 4549d0b4..f789cbfe 100644 --- a/build.rs +++ b/build.rs @@ -1,23 +1,33 @@ -use std::{collections::HashMap, env, sync::atomic::Ordering}; +use std::{collections::BTreeMap, env, path::PathBuf, sync::atomic::Ordering}; + +mod configure; + +use configure::{configure_f16_f128, Target}; fn main() { - println!("cargo:rerun-if-changed=build.rs"); + println!("cargo::rerun-if-changed=build.rs"); + println!("cargo::rerun-if-changed=configure.rs"); - let target = env::var("TARGET").unwrap(); + let target = Target::from_env(); let cwd = env::current_dir().unwrap(); + configure_check_cfg(); + configure_f16_f128(&target); + println!("cargo:compiler-rt={}", cwd.join("compiler-rt").display()); // Activate libm's unstable features to make full use of Nightly. 
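+    // Declaring custom cfgs up front via `cargo::rustc-check-cfg` keeps the
+    // `unexpected_cfgs` lint quiet, which matters because CI builds with
+    // `RUSTFLAGS=-Dwarnings`.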
+ println!("cargo::rustc-check-cfg=cfg(feature, values(\"unstable\", \"force-soft-floats\"))"); println!("cargo:rustc-cfg=feature=\"unstable\""); + println!("cargo:rustc-cfg=feature=\"force-soft-floats\""); // Emscripten's runtime includes all the builtins - if target.contains("emscripten") { + if target.os == "emscripten" { return; } // OpenBSD provides compiler_rt by default, use it instead of rebuilding it from source - if target.contains("openbsd") { + if target.os == "openbsd" { println!("cargo:rustc-link-search=native=/usr/lib"); println!("cargo:rustc-link-lib=compiler_rt"); return; @@ -25,20 +35,23 @@ fn main() { // Forcibly enable memory intrinsics on wasm & SGX as we don't have a libc to // provide them. - if (target.contains("wasm") && !target.contains("wasi")) - || (target.contains("sgx") && target.contains("fortanix")) - || target.contains("-none") - || target.contains("nvptx") - || target.contains("uefi") + if (target.triple.contains("wasm") && !target.triple.contains("wasi")) + || (target.triple.contains("sgx") && target.triple.contains("fortanix")) + || target.triple.contains("-none") + || target.triple.contains("nvptx") + || target.triple.contains("uefi") + || target.triple.contains("xous") { println!("cargo:rustc-cfg=feature=\"mem\""); } // These targets have hardware unaligned access support. - if target.contains("x86_64") - || target.contains("i686") - || target.contains("aarch64") - || target.contains("bpf") + println!("cargo::rustc-check-cfg=cfg(feature, values(\"mem-unaligned\"))"); + if target.arch.contains("x86_64") + || target.arch.contains("x86") + || target.arch.contains("aarch64") + || target.arch.contains("bpf") + || target.arch.contains("sbf") { println!("cargo:rustc-cfg=feature=\"mem-unaligned\""); } @@ -46,7 +59,7 @@ fn main() { // NOTE we are going to assume that llvm-target, what determines our codegen option, matches the // target triple. This is usually correct for our built-in targets but can break in presence of // custom targets, which can have arbitrary names. - let llvm_target = target.split('-').collect::>(); + let llvm_target = target.triple.split('-').collect::>(); // Build missing intrinsics from compiler-rt C source code. If we're // mangling names though we assume that we're also in test mode so we don't @@ -55,21 +68,15 @@ fn main() { if !cfg!(feature = "mangled-names") && cfg!(feature = "c") { // Don't use a C compiler for these targets: // - // * wasm - clang for wasm is somewhat hard to come by and it's - // unlikely that the C is really that much better than our own Rust. // * nvptx - everything is bitcode, not compatible with mixed C/Rust - // * riscv - the rust-lang/rust distribution container doesn't have a C - // compiler. - if !target.contains("wasm") - && !target.contains("nvptx") - && (!target.starts_with("riscv") || target.contains("xous")) - { + if !target.arch.contains("nvptx") { #[cfg(feature = "c")] c::compile(&llvm_target, &target); } } // To compile intrinsics.rs for thumb targets, where there is no libc + println!("cargo::rustc-check-cfg=cfg(thumb)"); if llvm_target[0].starts_with("thumb") { println!("cargo:rustc-cfg=thumb") } @@ -77,6 +84,7 @@ fn main() { // compiler-rt `cfg`s away some intrinsics for thumbv6m and thumbv8m.base because // these targets do not have full Thumb-2 support but only original Thumb-1. // We have to cfg our code accordingly. 
+ println!("cargo::rustc-check-cfg=cfg(thumb_1)"); if llvm_target[0] == "thumbv6m" || llvm_target[0] == "thumbv8m.base" { println!("cargo:rustc-cfg=thumb_1") } @@ -84,14 +92,15 @@ fn main() { // Only emit the ARM Linux atomic emulation on pre-ARMv6 architectures. This // includes the old androideabi. It is deprecated but it is available as a // rustc target (arm-linux-androideabi). + println!("cargo::rustc-check-cfg=cfg(kernel_user_helpers)"); if llvm_target[0] == "armv4t" || llvm_target[0] == "armv5te" - || target == "arm-linux-androideabi" + || target.triple == "arm-linux-androideabi" { println!("cargo:rustc-cfg=kernel_user_helpers") } - if llvm_target[0] == "aarch64" { + if llvm_target[0].starts_with("aarch64") { generate_aarch64_outlined_atomics(); } } @@ -117,7 +126,7 @@ fn generate_aarch64_outlined_atomics() { // Generate different macros for add/clr/eor/set so that we can test them separately. let sym_names = ["cas", "ldadd", "ldclr", "ldeor", "ldset", "swp"]; - let mut macros = HashMap::new(); + let mut macros = BTreeMap::new(); for sym in sym_names { macros.insert(sym, gen_macro(sym)); } @@ -145,22 +154,90 @@ fn generate_aarch64_outlined_atomics() { let mut buf = String::new(); for macro_def in macros.values().chain(std::iter::once(&cas16)) { buf += macro_def; - buf += "}; }"; + buf += "}; }\n"; } - let dst = std::env::var("OUT_DIR").unwrap() + "/outlined_atomics.rs"; - std::fs::write(dst, buf).unwrap(); + let out_dir = PathBuf::from(std::env::var("OUT_DIR").unwrap()); + std::fs::write(out_dir.join("outlined_atomics.rs"), buf).unwrap(); +} + +/// Emit directives for features we expect to support that aren't in `Cargo.toml`. +/// +/// These are mostly cfg elements emitted by this `build.rs`. +fn configure_check_cfg() { + // Functions where we can set the "optimized-c" flag + const HAS_OPTIMIZED_C: &[&str] = &[ + "__ashldi3", + "__ashlsi3", + "__ashrdi3", + "__ashrsi3", + "__bswapsi2", + "__bswapdi2", + "__bswapti2", + "__divdi3", + "__divsi3", + "__divmoddi4", + "__divmodsi4", + "__divmodsi4", + "__divmodti4", + "__lshrdi3", + "__lshrsi3", + "__moddi3", + "__modsi3", + "__muldi3", + "__udivdi3", + "__udivmoddi4", + "__udivmodsi4", + "__udivsi3", + "__umoddi3", + "__umodsi3", + ]; + + // Build a list of all aarch64 atomic operation functions + let mut aarch_atomic = Vec::new(); + for aarch_op in ["cas", "ldadd", "ldclr", "ldeor", "ldset", "swp"] { + let op_sizes = if aarch_op == "cas" { + [1, 2, 4, 8, 16].as_slice() + } else { + [1, 2, 4, 8].as_slice() + }; + + for op_size in op_sizes { + for ordering in ["relax", "acq", "rel", "acq_rel"] { + aarch_atomic.push(format!("__aarch64_{}{}_{}", aarch_op, op_size, ordering)); + } + } + } + + for fn_name in HAS_OPTIMIZED_C + .iter() + .copied() + .chain(aarch_atomic.iter().map(|s| s.as_str())) + { + println!( + "cargo::rustc-check-cfg=cfg({}, values(\"optimized-c\"))", + fn_name + ); + } + + // Rustc is unaware of sparc target features, but this does show up from + // `rustc --print target-features --target sparc64-unknown-linux-gnu`. 
+ println!("cargo::rustc-check-cfg=cfg(target_feature, values(\"vis3\"))"); + + // FIXME: these come from libm and should be changed there + println!("cargo::rustc-check-cfg=cfg(feature, values(\"checked\"))"); + println!("cargo::rustc-check-cfg=cfg(assert_no_panic)"); } #[cfg(feature = "c")] mod c { - extern crate cc; - use std::collections::{BTreeMap, HashSet}; use std::env; use std::fs::{self, File}; use std::io::Write; use std::path::{Path, PathBuf}; + use super::Target; + struct Sources { // SYMBOL -> PATH TO SOURCE map: BTreeMap<&'static str, &'static str>, @@ -201,11 +278,7 @@ mod c { } /// Compile intrinsics from the compiler-rt C source code - pub fn compile(llvm_target: &[&str], target: &String) { - let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap(); - let target_env = env::var("CARGO_CFG_TARGET_ENV").unwrap(); - let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap(); - let target_vendor = env::var("CARGO_CFG_TARGET_VENDOR").unwrap(); + pub fn compile(llvm_target: &[&str], target: &Target) { let mut consider_float_intrinsics = true; let cfg = &mut cc::Build::new(); @@ -214,8 +287,8 @@ mod c { // // Therefore, evaluate if those flags are present and set a boolean that causes any // compiler-rt intrinsics that contain floating point source to be excluded for this target. - if target_arch == "aarch64" { - let cflags_key = String::from("CFLAGS_") + &(target.to_owned().replace("-", "_")); + if target.arch == "aarch64" { + let cflags_key = String::from("CFLAGS_") + &(target.triple.replace("-", "_")); if let Ok(cflags_value) = env::var(cflags_key) { if cflags_value.contains("+nofp") || cflags_value.contains("+nosimd") { consider_float_intrinsics = false; @@ -223,9 +296,17 @@ mod c { } } + // `compiler-rt` requires `COMPILER_RT_HAS_FLOAT16` to be defined to make it use the + // `_Float16` type for `f16` intrinsics. This shouldn't matter as all existing `f16` + // intrinsics have been ported to Rust in `compiler-builtins` as C compilers don't + // support `_Float16` on all targets (whereas Rust does). However, define the macro + // anyway to prevent issues like rust#118813 and rust#123885 silently reoccuring if more + // `f16` intrinsics get accidentally added here in the future. + cfg.define("COMPILER_RT_HAS_FLOAT16", None); + cfg.warnings(false); - if target_env == "msvc" { + if target.env == "msvc" { // Don't pull in extra libraries on MSVC cfg.flag("/Zl"); @@ -247,6 +328,16 @@ mod c { // in https://github.com/rust-lang/compiler-rt/blob/c8fbcb3/cmake/config-ix.cmake#L19. cfg.flag_if_supported("-fomit-frame-pointer"); cfg.define("VISIBILITY_HIDDEN", None); + + if let "aarch64" | "arm64ec" = target.arch.as_str() { + // FIXME(llvm20): Older GCCs on A64 fail to build with + // -Werror=implicit-function-declaration due to a compiler-rt bug. + // With a newer LLVM we should be able to enable the flag everywhere. + // https://github.com/llvm/llvm-project/commit/8aa9d6206ce55bdaaf422839c351fbd63f033b89 + } else { + // Avoid implicitly creating references to undefined functions + cfg.flag("-Werror=implicit-function-declaration"); + } } // int_util.c tries to include stdlib.h if `_WIN32` is defined, @@ -254,7 +345,7 @@ mod c { // at odds with compiling with `-ffreestanding`, as the header // may be incompatible or not present. Create a minimal stub // header to use instead. 
- if target_os == "uefi" { + if target.os == "uefi" { let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()); let include_dir = out_dir.join("include"); if !include_dir.exists() { @@ -270,11 +361,7 @@ mod c { ("__absvsi2", "absvsi2.c"), ("__addvdi3", "addvdi3.c"), ("__addvsi3", "addvsi3.c"), - ("__clzdi2", "clzdi2.c"), - ("__clzsi2", "clzsi2.c"), ("__cmpdi2", "cmpdi2.c"), - ("__ctzdi2", "ctzdi2.c"), - ("__ctzsi2", "ctzsi2.c"), ("__int_util", "int_util.c"), ("__mulvdi3", "mulvdi3.c"), ("__mulvsi3", "mulvsi3.c"), @@ -294,39 +381,20 @@ mod c { sources.extend(&[ ("__divdc3", "divdc3.c"), ("__divsc3", "divsc3.c"), - ("__divxc3", "divxc3.c"), - ("__extendhfsf2", "extendhfsf2.c"), ("__muldc3", "muldc3.c"), ("__mulsc3", "mulsc3.c"), - ("__mulxc3", "mulxc3.c"), ("__negdf2", "negdf2.c"), ("__negsf2", "negsf2.c"), - ("__powixf2", "powixf2.c"), - ("__truncdfhf2", "truncdfhf2.c"), - ("__truncsfhf2", "truncsfhf2.c"), ]); } - // When compiling in rustbuild (the rust-lang/rust repo) this library - // also needs to satisfy intrinsics that jemalloc or C in general may - // need, so include a few more that aren't typically needed by - // LLVM/Rust. - if cfg!(feature = "rustbuild") { - sources.extend(&[("__ffsdi2", "ffsdi2.c")]); - } - // On iOS and 32-bit OSX these are all just empty intrinsics, no need to // include them. - if target_os != "ios" - && target_os != "watchos" - && (target_vendor != "apple" || target_arch != "x86") - { + if target.vendor != "apple" || target.arch != "x86" { sources.extend(&[ ("__absvti2", "absvti2.c"), ("__addvti3", "addvti3.c"), - ("__clzti2", "clzti2.c"), ("__cmpti2", "cmpti2.c"), - ("__ctzti2", "ctzti2.c"), ("__ffsti2", "ffsti2.c"), ("__mulvti3", "mulvti3.c"), ("__negti2", "negti2.c"), @@ -341,7 +409,7 @@ mod c { } } - if target_vendor == "apple" { + if target.vendor == "apple" { sources.extend(&[ ("atomic_flag_clear", "atomic_flag_clear.c"), ("atomic_flag_clear_explicit", "atomic_flag_clear_explicit.c"), @@ -355,29 +423,12 @@ mod c { ]); } - if target_env == "msvc" { - if target_arch == "x86_64" { - sources.extend(&[("__floatdixf", "x86_64/floatdixf.c")]); - } - } else { - // None of these seem to be used on x86_64 windows, and they've all - // got the wrong ABI anyway, so we want to avoid them. 
- if target_os != "windows" { - if target_arch == "x86_64" { - sources.extend(&[ - ("__floatdixf", "x86_64/floatdixf.c"), - ("__floatundixf", "x86_64/floatundixf.S"), - ]); - } - } - - if target_arch == "x86" { + if target.env != "msvc" { + if target.arch == "x86" { sources.extend(&[ ("__ashldi3", "i386/ashldi3.S"), ("__ashrdi3", "i386/ashrdi3.S"), ("__divdi3", "i386/divdi3.S"), - ("__floatdixf", "i386/floatdixf.S"), - ("__floatundixf", "i386/floatundixf.S"), ("__lshrdi3", "i386/lshrdi3.S"), ("__moddi3", "i386/moddi3.S"), ("__muldi3", "i386/muldi3.S"), @@ -387,19 +438,13 @@ mod c { } } - if target_arch == "arm" - && target_os != "ios" - && target_os != "watchos" - && target_env != "msvc" - { + if target.arch == "arm" && target.vendor != "apple" && target.env != "msvc" { sources.extend(&[ ("__aeabi_div0", "arm/aeabi_div0.c"), ("__aeabi_drsub", "arm/aeabi_drsub.c"), ("__aeabi_frsub", "arm/aeabi_frsub.c"), ("__bswapdi2", "arm/bswapdi2.S"), ("__bswapsi2", "arm/bswapsi2.S"), - ("__clzdi2", "arm/clzdi2.S"), - ("__clzsi2", "arm/clzsi2.S"), ("__divmodsi4", "arm/divmodsi4.S"), ("__divsi3", "arm/divsi3.S"), ("__modsi3", "arm/modsi3.S"), @@ -413,7 +458,7 @@ mod c { ("__umodsi3", "arm/umodsi3.S"), ]); - if target_os == "freebsd" { + if target.os == "freebsd" { sources.extend(&[("__clear_cache", "clear_cache.c")]); } @@ -485,61 +530,46 @@ mod c { ]); } - if target_arch == "aarch64" && consider_float_intrinsics { + if (target.arch == "aarch64" || target.arch == "arm64ec") && consider_float_intrinsics { sources.extend(&[ ("__comparetf2", "comparetf2.c"), - ("__extenddftf2", "extenddftf2.c"), - ("__extendsftf2", "extendsftf2.c"), - ("__fixtfdi", "fixtfdi.c"), - ("__fixtfsi", "fixtfsi.c"), - ("__fixtfti", "fixtfti.c"), - ("__fixunstfdi", "fixunstfdi.c"), - ("__fixunstfsi", "fixunstfsi.c"), - ("__fixunstfti", "fixunstfti.c"), ("__floatditf", "floatditf.c"), ("__floatsitf", "floatsitf.c"), ("__floatunditf", "floatunditf.c"), ("__floatunsitf", "floatunsitf.c"), - ("__trunctfdf2", "trunctfdf2.c"), - ("__trunctfsf2", "trunctfsf2.c"), - ("__addtf3", "addtf3.c"), - ("__multf3", "multf3.c"), - ("__subtf3", "subtf3.c"), - ("__divtf3", "divtf3.c"), - ("__powitf2", "powitf2.c"), ("__fe_getround", "fp_mode.c"), ("__fe_raise_inexact", "fp_mode.c"), ]); - if target_os != "windows" { + if target.os != "windows" { sources.extend(&[("__multc3", "multc3.c")]); } } - if target_arch == "mips" { + if target.arch == "mips" || target.arch == "riscv32" || target.arch == "riscv64" { sources.extend(&[("__bswapsi2", "bswapsi2.c")]); } - if target_arch == "mips64" { + if target.arch == "mips64" { + sources.extend(&[ + ("__netf2", "comparetf2.c"), + ("__floatsitf", "floatsitf.c"), + ("__floatunsitf", "floatunsitf.c"), + ("__fe_getround", "fp_mode.c"), + ]); + } + + if target.arch == "loongarch64" { sources.extend(&[ - ("__extenddftf2", "extenddftf2.c"), ("__netf2", "comparetf2.c"), - ("__addtf3", "addtf3.c"), - ("__multf3", "multf3.c"), - ("__subtf3", "subtf3.c"), - ("__fixtfsi", "fixtfsi.c"), ("__floatsitf", "floatsitf.c"), - ("__fixunstfsi", "fixunstfsi.c"), ("__floatunsitf", "floatunsitf.c"), ("__fe_getround", "fp_mode.c"), - ("__divtf3", "divtf3.c"), - ("__trunctfdf2", "trunctfdf2.c"), - ("__trunctfsf2", "trunctfsf2.c"), ]); } // Remove the assembly implementations that won't compile for the target - if llvm_target[0] == "thumbv6m" || llvm_target[0] == "thumbv8m.base" || target_os == "uefi" + if llvm_target[0] == "thumbv6m" || llvm_target[0] == "thumbv8m.base" || target.os == "uefi" { let mut to_remove = Vec::new(); for (k, v) in 
sources.map.iter() { @@ -548,9 +578,6 @@ mod c { } } sources.remove(&to_remove); - - // But use some generic implementations where possible - sources.extend(&[("__clzdi2", "clzdi2.c"), ("__clzsi2", "clzsi2.c")]) } if llvm_target[0] == "thumbv7m" || llvm_target[0] == "thumbv7em" { @@ -558,7 +585,7 @@ mod c { } // Android uses emulated TLS so we need a runtime support function. - if target_os == "android" { + if target.os == "android" { sources.extend(&[("__emutls_get_address", "emutls.c")]); // Work around a bug in the NDK headers (fixed in @@ -568,17 +595,36 @@ mod c { } // OpenHarmony also uses emulated TLS. - if target_env == "ohos" { + if target.env == "ohos" { sources.extend(&[("__emutls_get_address", "emutls.c")]); } + if target.os == "solana" { + cfg.define("__ELF__", None); + // Use the static-syscall target feature to detect if we're + // compiling for sbfv2, in which case set the corresponding clang + // cpu flag. + if target.features.iter().any(|el| el == "static-syscalls") { + cfg.flag("-mcpu=sbfv2"); + } + // Remove the implementations that fail to build. + // This list should shrink to zero + sources.remove(&[ + "__int_util", // Unsupported architecture error + "__mulvdi3", // Unsupported signed division + "__mulvsi3", // Unsupported signed division + ]); + } + // When compiling the C code we require the user to tell us where the // source code is, and this is largely done so when we're compiling as // part of rust-lang/rust we can use the same llvm-project repository as // rust-lang/rust. let root = match env::var_os("RUST_COMPILER_RT_ROOT") { Some(s) => PathBuf::from(s), - None => panic!("RUST_COMPILER_RT_ROOT is not set"), + None => { + panic!("RUST_COMPILER_RT_ROOT is not set. You may need to download compiler-rt.") + } }; if !root.exists() { panic!("RUST_COMPILER_RT_ROOT={} does not exist", root.display()); @@ -593,12 +639,17 @@ mod c { // sets of flags to the same source file. // Note: Out-of-line aarch64 atomics are not supported by the msvc toolchain (#430). let src_dir = root.join("lib/builtins"); - if target_arch == "aarch64" && target_env != "msvc" { + if target.arch == "aarch64" && target.env != "msvc" { // See below for why we're building these as separate libraries. build_aarch64_out_of_line_atomics_libraries(&src_dir, cfg); // Some run-time CPU feature detection is necessary, as well. 
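+        // Newer compiler-rt checkouts moved `cpu_model.c` into a `cpu_model/`
+        // directory, so probe for whichever layout is present: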
- sources.extend(&[("__aarch64_have_lse_atomics", "cpu_model.c")]); + let cpu_model_src = if src_dir.join("cpu_model.c").exists() { + "cpu_model.c" + } else { + "cpu_model/aarch64.c" + }; + sources.extend(&[("__aarch64_have_lse_atomics", cpu_model_src)]); } let mut added_sources = HashSet::new(); @@ -616,7 +667,7 @@ mod c { fn build_aarch64_out_of_line_atomics_libraries(builtins_dir: &Path, cfg: &mut cc::Build) { let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()); - let outlined_atomics_file = builtins_dir.join("aarch64/lse.S"); + let outlined_atomics_file = builtins_dir.join("aarch64").join("lse.S"); println!("cargo:rerun-if-changed={}", outlined_atomics_file.display()); cfg.include(&builtins_dir); diff --git a/ci/docker/aarch64-unknown-linux-gnu/Dockerfile b/ci/docker/aarch64-unknown-linux-gnu/Dockerfile index 9e2559f4..1aef14a9 100644 --- a/ci/docker/aarch64-unknown-linux-gnu/Dockerfile +++ b/ci/docker/aarch64-unknown-linux-gnu/Dockerfile @@ -1,4 +1,5 @@ -FROM ubuntu:18.04 +ARG IMAGE=ubuntu:24.04 +FROM $IMAGE RUN apt-get update && \ apt-get install -y --no-install-recommends \ gcc libc6-dev ca-certificates \ diff --git a/ci/docker/arm-unknown-linux-gnueabi/Dockerfile b/ci/docker/arm-unknown-linux-gnueabi/Dockerfile index afab874b..fc980377 100644 --- a/ci/docker/arm-unknown-linux-gnueabi/Dockerfile +++ b/ci/docker/arm-unknown-linux-gnueabi/Dockerfile @@ -1,4 +1,5 @@ -FROM ubuntu:18.04 +ARG IMAGE=ubuntu:24.04 +FROM $IMAGE RUN apt-get update && \ apt-get install -y --no-install-recommends \ gcc libc6-dev ca-certificates \ diff --git a/ci/docker/arm-unknown-linux-gnueabihf/Dockerfile b/ci/docker/arm-unknown-linux-gnueabihf/Dockerfile index 3ed3602b..a127f67c 100644 --- a/ci/docker/arm-unknown-linux-gnueabihf/Dockerfile +++ b/ci/docker/arm-unknown-linux-gnueabihf/Dockerfile @@ -1,4 +1,5 @@ -FROM ubuntu:18.04 +ARG IMAGE=ubuntu:24.04 +FROM $IMAGE RUN apt-get update && \ apt-get install -y --no-install-recommends \ gcc libc6-dev ca-certificates \ diff --git a/ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile b/ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile index 6617af15..67a3e51a 100644 --- a/ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile +++ b/ci/docker/armv7-unknown-linux-gnueabihf/Dockerfile @@ -1,4 +1,5 @@ -FROM ubuntu:18.04 +ARG IMAGE=ubuntu:24.04 +FROM $IMAGE RUN apt-get update && \ apt-get install -y --no-install-recommends \ gcc libc6-dev ca-certificates \ diff --git a/ci/docker/i586-unknown-linux-gnu/Dockerfile b/ci/docker/i586-unknown-linux-gnu/Dockerfile index 5783e28e..15285d9b 100644 --- a/ci/docker/i586-unknown-linux-gnu/Dockerfile +++ b/ci/docker/i586-unknown-linux-gnu/Dockerfile @@ -1,4 +1,5 @@ -FROM ubuntu:18.04 +ARG IMAGE=ubuntu:24.04 +FROM $IMAGE RUN apt-get update && \ apt-get install -y --no-install-recommends \ gcc-multilib libc6-dev ca-certificates diff --git a/ci/docker/i686-unknown-linux-gnu/Dockerfile b/ci/docker/i686-unknown-linux-gnu/Dockerfile index 5783e28e..15285d9b 100644 --- a/ci/docker/i686-unknown-linux-gnu/Dockerfile +++ b/ci/docker/i686-unknown-linux-gnu/Dockerfile @@ -1,4 +1,5 @@ -FROM ubuntu:18.04 +ARG IMAGE=ubuntu:24.04 +FROM $IMAGE RUN apt-get update && \ apt-get install -y --no-install-recommends \ gcc-multilib libc6-dev ca-certificates diff --git a/ci/docker/mips-unknown-linux-gnu/Dockerfile b/ci/docker/mips-unknown-linux-gnu/Dockerfile index f47e8f52..a47dd9f1 100644 --- a/ci/docker/mips-unknown-linux-gnu/Dockerfile +++ b/ci/docker/mips-unknown-linux-gnu/Dockerfile @@ -1,4 +1,5 @@ -FROM ubuntu:18.04 +ARG IMAGE=ubuntu:24.04 +FROM 
$IMAGE RUN apt-get update && \ apt-get install -y --no-install-recommends \ diff --git a/ci/docker/mips64-unknown-linux-gnuabi64/Dockerfile b/ci/docker/mips64-unknown-linux-gnuabi64/Dockerfile index 8fa77c7b..688aa1ab 100644 --- a/ci/docker/mips64-unknown-linux-gnuabi64/Dockerfile +++ b/ci/docker/mips64-unknown-linux-gnuabi64/Dockerfile @@ -1,4 +1,5 @@ -FROM ubuntu:18.04 +ARG IMAGE=ubuntu:24.04 +FROM $IMAGE RUN apt-get update && \ apt-get install -y --no-install-recommends \ ca-certificates \ diff --git a/ci/docker/mips64el-unknown-linux-gnuabi64/Dockerfile b/ci/docker/mips64el-unknown-linux-gnuabi64/Dockerfile index c6611d9a..27d032a1 100644 --- a/ci/docker/mips64el-unknown-linux-gnuabi64/Dockerfile +++ b/ci/docker/mips64el-unknown-linux-gnuabi64/Dockerfile @@ -1,4 +1,5 @@ -FROM ubuntu:18.04 +ARG IMAGE=ubuntu:24.04 +FROM $IMAGE RUN apt-get update && \ apt-get install -y --no-install-recommends \ ca-certificates \ diff --git a/ci/docker/mipsel-unknown-linux-gnu/Dockerfile b/ci/docker/mipsel-unknown-linux-gnu/Dockerfile index 0bc69562..4d18a6ed 100644 --- a/ci/docker/mipsel-unknown-linux-gnu/Dockerfile +++ b/ci/docker/mipsel-unknown-linux-gnu/Dockerfile @@ -1,4 +1,5 @@ -FROM ubuntu:18.04 +ARG IMAGE=ubuntu:24.04 +FROM $IMAGE RUN apt-get update && \ apt-get install -y --no-install-recommends \ diff --git a/ci/docker/powerpc-unknown-linux-gnu/Dockerfile b/ci/docker/powerpc-unknown-linux-gnu/Dockerfile index 2d39fef6..5225b833 100644 --- a/ci/docker/powerpc-unknown-linux-gnu/Dockerfile +++ b/ci/docker/powerpc-unknown-linux-gnu/Dockerfile @@ -1,4 +1,5 @@ -FROM ubuntu:18.04 +ARG IMAGE=ubuntu:24.04 +FROM $IMAGE RUN apt-get update && \ apt-get install -y --no-install-recommends \ diff --git a/ci/docker/powerpc64-unknown-linux-gnu/Dockerfile b/ci/docker/powerpc64-unknown-linux-gnu/Dockerfile index 653cd351..cbd78eac 100644 --- a/ci/docker/powerpc64-unknown-linux-gnu/Dockerfile +++ b/ci/docker/powerpc64-unknown-linux-gnu/Dockerfile @@ -1,4 +1,5 @@ -FROM ubuntu:18.04 +ARG IMAGE=ubuntu:24.04 +FROM $IMAGE RUN apt-get update && \ apt-get install -y --no-install-recommends \ diff --git a/ci/docker/powerpc64le-unknown-linux-gnu/Dockerfile b/ci/docker/powerpc64le-unknown-linux-gnu/Dockerfile index 63ea9af9..bad06429 100644 --- a/ci/docker/powerpc64le-unknown-linux-gnu/Dockerfile +++ b/ci/docker/powerpc64le-unknown-linux-gnu/Dockerfile @@ -1,4 +1,5 @@ -FROM ubuntu:18.04 +ARG IMAGE=ubuntu:24.04 +FROM $IMAGE RUN apt-get update && \ apt-get install -y --no-install-recommends \ diff --git a/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile b/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile new file mode 100644 index 00000000..4d4a194f --- /dev/null +++ b/ci/docker/riscv64gc-unknown-linux-gnu/Dockerfile @@ -0,0 +1,13 @@ +ARG IMAGE=ubuntu:24.04 +FROM $IMAGE + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + gcc libc6-dev qemu-user-static ca-certificates \ + gcc-riscv64-linux-gnu libc6-dev-riscv64-cross \ + qemu-system-riscv64 + +ENV CARGO_TARGET_RISCV64GC_UNKNOWN_LINUX_GNU_LINKER=riscv64-linux-gnu-gcc \ + CARGO_TARGET_RISCV64GC_UNKNOWN_LINUX_GNU_RUNNER=qemu-riscv64-static \ + QEMU_LD_PREFIX=/usr/riscv64-linux-gnu \ + RUST_TEST_THREADS=1 diff --git a/ci/docker/sbf-solana-solana/Dockerfile b/ci/docker/sbf-solana-solana/Dockerfile new file mode 100644 index 00000000..d60cc649 --- /dev/null +++ b/ci/docker/sbf-solana-solana/Dockerfile @@ -0,0 +1,23 @@ +FROM ubuntu:20.04 +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + curl \ + gcc libc6-dev ca-certificates + +ENV 
RUSTUP_INIT_SKIP_PATH_CHECK="yes" +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y -v --no-modify-path +RUN cp ${HOME}/.cargo/bin/* /usr/local/bin/ + +RUN cargo install --git https://github.com/solana-labs/cargo-run-solana-tests.git \ + --rev df2f642924aee7bbd2566017b3d71cb0c389b015 \ + --bin cargo-run-solana-tests --root /usr/local + +RUN mkdir -p /tmp/.cache/solana/v1.38/platform-tools +RUN curl -L -o platform-tools-linux-x86_64.tar.bz2 https://github.com/solana-labs/platform-tools/releases/download/v1.38/platform-tools-linux-x86_64.tar.bz2 +RUN tar -xjf platform-tools-linux-x86_64.tar.bz2 --strip-components 1 -C /tmp/.cache/solana/v1.38/platform-tools +RUN rustup toolchain link solana /tmp/.cache/solana/v1.38/platform-tools/rust +RUN cp -R ${HOME}/.rustup /tmp/ + +ENV CARGO_TARGET_SBF_SOLANA_SOLANA_RUNNER="cargo-run-solana-tests --heap-size 104857600" +ENV CC="/tmp/.cache/solana/v1.38/platform-tools/llvm/bin/clang" +ENV RUSTUP_TOOLCHAIN="solana" diff --git a/ci/docker/thumbv6m-none-eabi/Dockerfile b/ci/docker/thumbv6m-none-eabi/Dockerfile index 04d4f442..f966b2b9 100644 --- a/ci/docker/thumbv6m-none-eabi/Dockerfile +++ b/ci/docker/thumbv6m-none-eabi/Dockerfile @@ -1,7 +1,8 @@ -FROM ubuntu:18.04 +ARG IMAGE=ubuntu:24.04 +FROM $IMAGE RUN apt-get update && \ apt-get install -y --no-install-recommends \ gcc libc6-dev ca-certificates \ gcc-arm-none-eabi \ libnewlib-arm-none-eabi -ENV XARGO=1 +ENV NO_STD=1 diff --git a/ci/docker/thumbv7em-none-eabi/Dockerfile b/ci/docker/thumbv7em-none-eabi/Dockerfile index 04d4f442..f966b2b9 100644 --- a/ci/docker/thumbv7em-none-eabi/Dockerfile +++ b/ci/docker/thumbv7em-none-eabi/Dockerfile @@ -1,7 +1,8 @@ -FROM ubuntu:18.04 +ARG IMAGE=ubuntu:24.04 +FROM $IMAGE RUN apt-get update && \ apt-get install -y --no-install-recommends \ gcc libc6-dev ca-certificates \ gcc-arm-none-eabi \ libnewlib-arm-none-eabi -ENV XARGO=1 +ENV NO_STD=1 diff --git a/ci/docker/thumbv7em-none-eabihf/Dockerfile b/ci/docker/thumbv7em-none-eabihf/Dockerfile index 04d4f442..f966b2b9 100644 --- a/ci/docker/thumbv7em-none-eabihf/Dockerfile +++ b/ci/docker/thumbv7em-none-eabihf/Dockerfile @@ -1,7 +1,8 @@ -FROM ubuntu:18.04 +ARG IMAGE=ubuntu:24.04 +FROM $IMAGE RUN apt-get update && \ apt-get install -y --no-install-recommends \ gcc libc6-dev ca-certificates \ gcc-arm-none-eabi \ libnewlib-arm-none-eabi -ENV XARGO=1 +ENV NO_STD=1 diff --git a/ci/docker/thumbv7m-none-eabi/Dockerfile b/ci/docker/thumbv7m-none-eabi/Dockerfile index 04d4f442..f966b2b9 100644 --- a/ci/docker/thumbv7m-none-eabi/Dockerfile +++ b/ci/docker/thumbv7m-none-eabi/Dockerfile @@ -1,7 +1,8 @@ -FROM ubuntu:18.04 +ARG IMAGE=ubuntu:24.04 +FROM $IMAGE RUN apt-get update && \ apt-get install -y --no-install-recommends \ gcc libc6-dev ca-certificates \ gcc-arm-none-eabi \ libnewlib-arm-none-eabi -ENV XARGO=1 +ENV NO_STD=1 diff --git a/ci/docker/wasm32-unknown-unknown/Dockerfile b/ci/docker/wasm32-unknown-unknown/Dockerfile index 758d94d5..4d12b6ff 100644 --- a/ci/docker/wasm32-unknown-unknown/Dockerfile +++ b/ci/docker/wasm32-unknown-unknown/Dockerfile @@ -1,6 +1,7 @@ -FROM ubuntu:20.04 +ARG IMAGE=ubuntu:20.04 +FROM $IMAGE RUN apt-get update && \ apt-get install -y --no-install-recommends \ - gcc libc6-dev ca-certificates + gcc clang libc6-dev ca-certificates ENV CARGO_TARGET_WASM32_UNKNOWN_UNKNOWN_RUNNER=true diff --git a/ci/docker/x86_64-unknown-linux-gnu/Dockerfile b/ci/docker/x86_64-unknown-linux-gnu/Dockerfile index 98000f4e..670c2439 100644 --- a/ci/docker/x86_64-unknown-linux-gnu/Dockerfile +++ 
b/ci/docker/x86_64-unknown-linux-gnu/Dockerfile @@ -1,4 +1,5 @@ -FROM ubuntu:18.04 +ARG IMAGE=ubuntu:24.04 +FROM $IMAGE RUN apt-get update && \ apt-get install -y --no-install-recommends \ gcc libc6-dev ca-certificates diff --git a/ci/run-docker.sh b/ci/run-docker.sh index 8c4af0ef..215ad71a 100755 --- a/ci/run-docker.sh +++ b/ci/run-docker.sh @@ -1,38 +1,92 @@ +#!/bin/bash + # Small script to run tests for a target (or all targets) inside all the # respective docker images. -set -ex +set -euxo pipefail run() { - local target=$1 + local target="$1" - echo $target + echo "TESTING TARGET: $target" # This directory needs to exist before calling docker, otherwise docker will create it but it # will be owned by root mkdir -p target - docker build -t $target ci/docker/$target + if [ "$(uname -s)" = "Linux" ] && [ -z "${DOCKER_BASE_IMAGE:-}" ]; then + # Share the host rustc and target. Do this only on Linux and if the image + # isn't overridden + run_args=( + --user "$(id -u):$(id -g)" + -e "CARGO_HOME=/cargo" + -v "${HOME}/.cargo:/cargo" + -v "$(pwd)/target:/builtins-target" + -v "$(rustc --print sysroot):/rust:ro" + ) + run_cmd="HOME=/tmp PATH=\$PATH:/rust/bin ci/run.sh $target" + else + # Use rustc provided by a docker image + docker volume create compiler-builtins-cache + build_args=( + "--build-arg" "IMAGE=${DOCKER_BASE_IMAGE:-rustlang/rust:nightly}" + ) + run_args=( + -v "compiler-builtins-cache:/builtins-target" + ) + run_cmd="HOME=/tmp USING_CONTAINER_RUSTC=1 ci/run.sh $target" + fi + + if [ -d compiler-rt ]; then + export RUST_COMPILER_RT_ROOT=./compiler-rt + fi + + if [ "${GITHUB_ACTIONS:-}" = "true" ]; then + # Enable Docker image caching on GHA + + build_cmd=("buildx" "build") + build_args=( + "--cache-from" "type=local,src=/tmp/.buildx-cache" + "--cache-to" "type=local,dest=/tmp/.buildx-cache-new" + # This is the beautiful bash syntax for expanding an array but neither + # raising an error nor returning an empty string if the array is empty. + "${build_args[@]:+"${build_args[@]}"}" + "--load" + ) + fi + + docker "${build_cmd[@]:-build}" \ + -t "builtins-$target" \ + "${build_args[@]:-}" \ + "ci/docker/$target" docker run \ --rm \ - --user $(id -u):$(id -g) \ - -e CARGO_HOME=/cargo \ - -e CARGO_TARGET_DIR=/target \ -e RUST_COMPILER_RT_ROOT \ - -v "${HOME}/.cargo":/cargo \ - -v `pwd`/target:/target \ - -v `pwd`:/checkout:ro \ - -v `rustc --print sysroot`:/rust:ro \ + -e RUSTFLAGS \ + -e "CARGO_TARGET_DIR=/builtins-target" \ + -v "$(pwd):/checkout:ro" \ -w /checkout \ + "${run_args[@]:-}" \ --init \ - $target \ - sh -c "HOME=/tmp PATH=\$PATH:/rust/bin ci/run.sh $target" + "builtins-$target" \ + sh -c "$run_cmd" } -if [ -z "$1" ]; then - for d in `ls ci/docker/`; do - run $d +if [ "${1:-}" = "--help" ] || [ "$#" -gt 1 ]; then + set +x + echo "\ + usage: ./ci/run-docker.sh [target] + + you can also set DOCKER_BASE_IMAGE to use something other than the default + ubuntu:24.04 (or rustlang/rust:nightly). 
+ " + exit +fi + +if [ -z "${1:-}" ]; then + for d in ci/docker/*; do + run $(basename "$d") done else - run $1 + run "$1" fi diff --git a/ci/run.sh b/ci/run.sh index 44ec30fb..057cdb08 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -1,30 +1,73 @@ -set -ex +#!/bin/bash -cargo=cargo +set -eux + +target="${1:-}" + +export RUST_BACKTRACE="${RUST_BACKTRACE:-full}" + +if [ -z "$target" ]; then + host_target=$(rustc -vV | awk '/^host/ { print $2 }') + echo "Defaulted to host target $host_target" + target="$host_target" +fi + +if [ "${USING_CONTAINER_RUSTC:-}" = 1 ]; then + # Install nonstandard components if we have control of the environment + rustup target list --installed | + grep -E "^$target\$" || + rustup target add "$target" +fi # Test our implementation -if [ "$XARGO" = "1" ]; then - # FIXME: currently these tests don't work... - echo nothing to do +if [ "${NO_STD:-}" = "1" ]; then + echo "nothing to do for no_std" else - run="cargo test --manifest-path testcrate/Cargo.toml --target $1" + run="cargo test --manifest-path testcrate/Cargo.toml --no-fail-fast --target $target" $run $run --release $run --features c $run --features c --release $run --features no-asm $run --features no-asm --release + $run --features no-f16-f128 + $run --features no-f16-f128 --release + $run --benches + $run --benches --release +fi + +if [ "${TEST_VERBATIM:-}" = "1" ]; then + verb_path=$(cmd.exe //C echo \\\\?\\%cd%\\testcrate\\target2) + cargo build --manifest-path testcrate/Cargo.toml \ + --target "$target" --target-dir "$verb_path" --features c fi -cargo build --target $1 -cargo build --target $1 --release -cargo build --target $1 --features c -cargo build --target $1 --release --features c -cargo build --target $1 --features no-asm -cargo build --target $1 --release --features no-asm +declare -a rlib_paths -PREFIX=$(echo $1 | sed -e 's/unknown-//')- -case $1 in +# Set the `rlib_paths` global array to a list of all compiler-builtins rlibs +update_rlib_paths() { + if [ -d /builtins-target ]; then + rlib_paths=( /builtins-target/"${target}"/debug/deps/libcompiler_builtins-*.rlib ) + else + rlib_paths=( target/"${target}"/debug/deps/libcompiler_builtins-*.rlib ) + fi +} + +# Remove any existing artifacts from previous tests that don't set #![compiler_builtins] +update_rlib_paths +rm -f "${rlib_paths[@]}" + +cargo build --target "$target" +cargo build --target "$target" --release +cargo build --target "$target" --features c +cargo build --target "$target" --release --features c +cargo build --target "$target" --features no-asm +cargo build --target "$target" --release --features no-asm +cargo build --target "$target" --features no-f16-f128 +cargo build --target "$target" --release --features no-f16-f128 + +PREFIX=${target//unknown-/}- +case "$target" in armv7-*) PREFIX=arm-linux-gnueabihf- ;; @@ -36,76 +79,91 @@ case $1 in ;; esac -NM=$(find $(rustc --print sysroot) -name llvm-nm) +NM=$(find "$(rustc --print sysroot)" \( -name llvm-nm -o -name llvm-nm.exe \) ) if [ "$NM" = "" ]; then - NM=${PREFIX}nm + NM="${PREFIX}nm" fi -if [ -d /target ]; then - path=/target/${1}/debug/deps/libcompiler_builtins-*.rlib -else - path=target/${1}/debug/deps/libcompiler_builtins-*.rlib +# i686-pc-windows-gnu tools have a dependency on some DLLs, so run it with +# rustup run to ensure that those are in PATH. 
+TOOLCHAIN="$(rustup show active-toolchain | sed 's/ (default)//')"
+if [[ "$TOOLCHAIN" == *i686-pc-windows-gnu ]]; then
+    NM="rustup run $TOOLCHAIN $NM"
+fi
 
 # Look out for duplicated symbols when we include the compiler-rt (C) implementation
-for rlib in $(echo $path); do
+update_rlib_paths
+for rlib in "${rlib_paths[@]}"; do
     set +x
     echo "================================================================"
-    echo checking $rlib for duplicate symbols
+    echo "checking $rlib for duplicate symbols"
     echo "================================================================"
+    set -x
+
+    duplicates_found=0
 
-    stdout=$($NM -g --defined-only $rlib 2>&1)
     # NOTE On i586, It's normal that the get_pc_thunk symbol appears several
     # times so ignore it
-    #
-    # FIXME(#167) - we shouldn't ignore `__builtin_cl` style symbols here.
-    set +e
-    echo "$stdout" | \
-      sort | \
-      uniq -d | \
-      grep -v __x86.get_pc_thunk | \
-      grep -v __builtin_cl | \
-      grep -v __builtin_ctz | \
-      grep 'T __'
-
-    if test $? = 0; then
+    $NM -g --defined-only "$rlib" 2>&1 |
+        sort |
+        uniq -d |
+        grep -v __x86.get_pc_thunk |
+        grep 'T __' && duplicates_found=1
+
+    if [ "$duplicates_found" != 0 ]; then
+        echo "error: found duplicate symbols"
         exit 1
+    else
+        echo "success; no duplicate symbols found"
     fi
-
-    set -ex
 done
 
-rm -f $path
+rm -f "${rlib_paths[@]}"
+
+build_intrinsics() {
+    cargo build --target "$target" -v --example intrinsics "$@"
+}
 
 # Verify that we haven't drop any intrinsic/symbol
-build_intrinsics="$cargo build --target $1 -v --example intrinsics"
-RUSTFLAGS="-C debug-assertions=no" $build_intrinsics
-RUSTFLAGS="-C debug-assertions=no" $build_intrinsics --release
-RUSTFLAGS="-C debug-assertions=no" $build_intrinsics --features c
-RUSTFLAGS="-C debug-assertions=no" $build_intrinsics --features c --release
+build_intrinsics
+build_intrinsics --release
+build_intrinsics --features c
+build_intrinsics --features c --release
 
 # Verify that there are no undefined symbols to `panic` within our
 # implementations
-#
-# TODO(#79) fix the undefined references problem for debug-assertions+lto
-if [ -z "$DEBUG_LTO_BUILD_DOESNT_WORK" ]; then
-    RUSTFLAGS="-C debug-assertions=no" \
-      CARGO_INCREMENTAL=0 \
-      CARGO_PROFILE_DEV_LTO=true \
-      $cargo rustc --features "$INTRINSICS_FEATURES" --target $1 --example intrinsics
-fi
+CARGO_PROFILE_DEV_LTO=true \
+    cargo build --target "$target" --example intrinsics
 CARGO_PROFILE_RELEASE_LTO=true \
-    $cargo rustc --features "$INTRINSICS_FEATURES" --target $1 --example intrinsics --release
+    cargo build --target "$target" --example intrinsics --release
 
-# Ensure no references to a panicking function
-for rlib in $(echo $path); do
-    set +ex
-    $NM -u $rlib 2>&1 | grep panicking
-
-    if test $?
= 0; then
+# Ensure no references to any symbols from core
+update_rlib_paths
+for rlib in "${rlib_paths[@]}"; do
+    set +x
+    echo "================================================================"
+    echo "checking $rlib for references to core"
+    echo "================================================================"
+    set -x
+
+    tmpdir="${CARGO_TARGET_DIR:-target}/tmp"
+    test -d "$tmpdir" || mkdir "$tmpdir"
+    defined="$tmpdir/defined_symbols.txt"
+    undefined="$tmpdir/undefined_symbols.txt"
+
+    $NM --quiet -U "$rlib" | grep 'T _ZN4core' | awk '{print $3}' | sort | uniq > "$defined"
+    $NM --quiet -u "$rlib" | grep 'U _ZN4core' | awk '{print $2}' | sort | uniq > "$undefined"
+    grep_has_results=0
+    grep -v -F -x -f "$defined" "$undefined" && grep_has_results=1
+
+    if [ "$target" = "powerpc64-unknown-linux-gnu" ]; then
+        echo "FIXME: powerpc64 fails these tests"
+    elif [ "$grep_has_results" != 0 ]; then
+        echo "error: found unexpected references to core"
         exit 1
+    else
+        echo "success; no references to core found"
     fi
-    set -ex
 done
 
 true
diff --git a/configure.rs b/configure.rs
new file mode 100644
index 00000000..e23c0e83
--- /dev/null
+++ b/configure.rs
@@ -0,0 +1,92 @@
+// Configuration that is shared between `compiler_builtins` and `testcrate`.
+
+use std::env;
+
+#[derive(Debug)]
+#[allow(dead_code)]
+pub struct Target {
+    pub triple: String,
+    pub os: String,
+    pub arch: String,
+    pub vendor: String,
+    pub env: String,
+    pub pointer_width: u8,
+    pub little_endian: bool,
+    pub features: Vec<String>,
+}
+
+impl Target {
+    pub fn from_env() -> Self {
+        let little_endian = match env::var("CARGO_CFG_TARGET_ENDIAN").unwrap().as_str() {
+            "little" => true,
+            "big" => false,
+            x => panic!("unknown endian {x}"),
+        };
+
+        Self {
+            triple: env::var("TARGET").unwrap(),
+            os: env::var("CARGO_CFG_TARGET_OS").unwrap(),
+            arch: env::var("CARGO_CFG_TARGET_ARCH").unwrap(),
+            vendor: env::var("CARGO_CFG_TARGET_VENDOR").unwrap(),
+            env: env::var("CARGO_CFG_TARGET_ENV").unwrap(),
+            pointer_width: env::var("CARGO_CFG_TARGET_POINTER_WIDTH")
+                .unwrap()
+                .parse()
+                .unwrap(),
+            little_endian,
+            features: env::var("CARGO_CFG_TARGET_FEATURE")
+                .unwrap_or_default()
+                .split(",")
+                .map(ToOwned::to_owned)
+                .collect(),
+        }
+    }
+
+    #[allow(dead_code)]
+    pub fn has_feature(&self, feature: &str) -> bool {
+        self.features.iter().any(|f| f == feature)
+    }
+}
+
+/// Configure whether or not `f16` and `f128` support should be enabled.
+pub fn configure_f16_f128(target: &Target) {
+    // Set whether or not `f16` and `f128` are supported at a basic level by LLVM. This only means
+    // that the backend will not crash when using these types. This does not mean that the
+    // backend does the right thing, or that the platform doesn't have ABI bugs.
+    //
+    // We do this here rather than in `rust-lang/rust` because configuring via cargo features is
+    // not straightforward.
+    //
+    // Original source of this list:
+    //
+    let (f16_ok, f128_ok) = match target.arch.as_str() {
+        // `f16` and `f128` both crash
+        "arm64ec" => (false, false),
+        // `f16` crashes
+        "s390x" => (false, true),
+        // `f128` crashes
+        "mips64" | "mips64r6" => (true, false),
+        // `f128` crashes
+        "powerpc64" if &target.os == "aix" => (true, false),
+        // `f128` crashes
+        "sparc" | "sparcv9" => (true, false),
+        // `f16` miscompiles
+        "wasm32" | "wasm64" => (false, true),
+        // Most everything else works as of LLVM 19
+        _ => (true, true),
+    };
+
+    // If the feature is set, disable these types.
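+    // (Illustrative aside, not part of the patch: for every enabled cargo feature, build
+    // scripts see an env var named `CARGO_FEATURE_<NAME>`, uppercased with `-` mapped to
+    // `_`, so the `no-f16-f128` feature surfaces here as `CARGO_FEATURE_NO_F16_F128`.)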
+ let disable_both = env::var_os("CARGO_FEATURE_NO_F16_F128").is_some(); + + println!("cargo::rustc-check-cfg=cfg(f16_enabled)"); + println!("cargo::rustc-check-cfg=cfg(f128_enabled)"); + + if f16_ok && !disable_both { + println!("cargo::rustc-cfg=f16_enabled"); + } + + if f128_ok && !disable_both { + println!("cargo::rustc-cfg=f128_enabled"); + } +} diff --git a/crates/panic-handler/Cargo.toml b/crates/panic-handler/Cargo.toml index 1dea613d..2ad85840 100644 --- a/crates/panic-handler/Cargo.toml +++ b/crates/panic-handler/Cargo.toml @@ -2,5 +2,7 @@ name = "panic-handler" version = "0.1.0" authors = ["Alex Crichton "] +edition = "2021" +publish = false [dependencies] diff --git a/examples/intrinsics.rs b/examples/intrinsics.rs index 19bb569b..595a8f20 100644 --- a/examples/intrinsics.rs +++ b/examples/intrinsics.rs @@ -4,18 +4,24 @@ // to link due to the missing intrinsic (symbol). #![allow(unused_features)] -#![allow(stable_features)] // bench_black_box feature is stable, leaving for backcompat +#![allow(internal_features)] #![cfg_attr(thumb, no_main)] #![deny(dead_code)] -#![feature(bench_black_box)] +#![feature(allocator_api)] +#![feature(f128)] +#![feature(f16)] #![feature(lang_items)] #![feature(start)] -#![feature(allocator_api)] #![no_std] extern crate panic_handler; -#[cfg(all(not(thumb), not(windows), not(target_arch = "wasm32")))] +#[cfg(all( + not(thumb), + not(windows), + not(target_arch = "wasm32"), + not(target_os = "solana") +))] #[link(name = "c")] extern "C" {} @@ -25,6 +31,101 @@ extern "C" {} // have an additional comment: the function name is the ARM name for the intrinsic and the comment // in the non-ARM name for the intrinsic. mod intrinsics { + /* f16 operations */ + + pub fn extendhfsf(x: f16) -> f32 { + x as f32 + } + + pub fn extendhfdf(x: f16) -> f64 { + x as f64 + } + + #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))] + pub fn extendhftf(x: f16) -> f128 { + x as f128 + } + + /* f32 operations */ + + pub fn truncsfhf(x: f32) -> f16 { + x as f16 + } + + // extendsfdf2 + pub fn aeabi_f2d(x: f32) -> f64 { + x as f64 + } + + pub fn extendsftf(x: f32) -> f128 { + x as f128 + } + + // fixsfsi + pub fn aeabi_f2iz(x: f32) -> i32 { + x as i32 + } + + // fixsfdi + pub fn aeabi_f2lz(x: f32) -> i64 { + x as i64 + } + + pub fn fixsfti(x: f32) -> i128 { + x as i128 + } + + // fixunssfsi + pub fn aeabi_f2uiz(x: f32) -> u32 { + x as u32 + } + + // fixunssfdi + pub fn aeabi_f2ulz(x: f32) -> u64 { + x as u64 + } + + pub fn fixunssfti(x: f32) -> u128 { + x as u128 + } + + // addsf3 + pub fn aeabi_fadd(a: f32, b: f32) -> f32 { + a + b + } + + // eqsf2 + pub fn aeabi_fcmpeq(a: f32, b: f32) -> bool { + a == b + } + + // gtsf2 + pub fn aeabi_fcmpgt(a: f32, b: f32) -> bool { + a > b + } + + // ltsf2 + pub fn aeabi_fcmplt(a: f32, b: f32) -> bool { + a < b + } + + // divsf3 + pub fn aeabi_fdiv(a: f32, b: f32) -> f32 { + a / b + } + + // mulsf3 + pub fn aeabi_fmul(a: f32, b: f32) -> f32 { + a * b + } + + // subsf3 + pub fn aeabi_fsub(a: f32, b: f32) -> f32 { + a - b + } + + /* f64 operations */ + // truncdfsf2 pub fn aeabi_d2f(x: f64) -> f32 { x as f32 @@ -40,6 +141,10 @@ mod intrinsics { x as i64 } + pub fn fixdfti(x: f64) -> i128 { + x as i128 + } + // fixunsdfsi pub fn aeabi_d2uiz(x: f64) -> u32 { x as u32 @@ -50,6 +155,10 @@ mod intrinsics { x as u64 } + pub fn fixunsdfti(x: f64) -> u128 { + x as u128 + } + // adddf3 pub fn aeabi_dadd(a: f64, b: f64) -> f64 { a + b @@ -85,66 +194,81 @@ mod intrinsics { a - b } - // extendsfdf2 - pub fn aeabi_f2d(x: f32) -> f64 { + 
/* f128 operations */ + + #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))] + pub fn trunctfhf(x: f128) -> f16 { + x as f16 + } + + pub fn trunctfsf(x: f128) -> f32 { + x as f32 + } + + pub fn trunctfdf(x: f128) -> f64 { x as f64 } - // fixsfsi - pub fn aeabi_f2iz(x: f32) -> i32 { + #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))] + pub fn fixtfsi(x: f128) -> i32 { x as i32 } - // fixsfdi - pub fn aeabi_f2lz(x: f32) -> i64 { + #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))] + pub fn fixtfdi(x: f128) -> i64 { x as i64 } - // fixunssfsi - pub fn aeabi_f2uiz(x: f32) -> u32 { + #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))] + pub fn fixtfti(x: f128) -> i128 { + x as i128 + } + + #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))] + pub fn fixunstfsi(x: f128) -> u32 { x as u32 } - // fixunssfdi - pub fn aeabi_f2ulz(x: f32) -> u64 { + #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))] + pub fn fixunstfdi(x: f128) -> u64 { x as u64 } - // addsf3 - pub fn aeabi_fadd(a: f32, b: f32) -> f32 { + #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))] + pub fn fixunstfti(x: f128) -> u128 { + x as u128 + } + + pub fn addtf(a: f128, b: f128) -> f128 { a + b } - // eqsf2 - pub fn aeabi_fcmpeq(a: f32, b: f32) -> bool { + pub fn eqtf(a: f128, b: f128) -> bool { a == b } - // gtsf2 - pub fn aeabi_fcmpgt(a: f32, b: f32) -> bool { + pub fn gttf(a: f128, b: f128) -> bool { a > b } - // ltsf2 - pub fn aeabi_fcmplt(a: f32, b: f32) -> bool { + pub fn lttf(a: f128, b: f128) -> bool { a < b } - // divsf3 - pub fn aeabi_fdiv(a: f32, b: f32) -> f32 { - a / b + pub fn multf(a: f128, b: f128) -> f128 { + a * b } - // mulsf3 - pub fn aeabi_fmul(a: f32, b: f32) -> f32 { - a * b + pub fn divtf(a: f128, b: f128) -> f128 { + a / b } - // subsf3 - pub fn aeabi_fsub(a: f32, b: f32) -> f32 { + pub fn subtf(a: f128, b: f128) -> f128 { a - b } + /* i32 operations */ + // floatsidf pub fn aeabi_i2d(x: i32) -> f64 { x as f64 @@ -163,14 +287,20 @@ mod intrinsics { a % b } + /* i64 operations */ + + // floatdisf + pub fn aeabi_l2f(x: i64) -> f32 { + x as f32 + } + // floatdidf pub fn aeabi_l2d(x: i64) -> f64 { x as f64 } - // floatdisf - pub fn aeabi_l2f(x: i64) -> f32 { - x as f32 + pub fn mulodi4(a: i64, b: i64) -> i64 { + a * b } // divdi3 @@ -178,11 +308,31 @@ mod intrinsics { a / b } + pub fn moddi3(a: i64, b: i64) -> i64 { + a % b + } + // muldi3 pub fn aeabi_lmul(a: i64, b: i64) -> i64 { a.wrapping_mul(b) } + /* i128 operations */ + + pub fn lshrti3(a: i128, b: usize) -> i128 { + a >> b + } + + pub fn divti3(a: i128, b: i128) -> i128 { + a / b + } + + pub fn modti3(a: i128, b: i128) -> i128 { + a % b + } + + /* u32 operations */ + // floatunsidf pub fn aeabi_ui2d(x: u32) -> f64 { x as f64 @@ -201,26 +351,20 @@ mod intrinsics { a % b } - // floatundidf - pub fn aeabi_ul2d(x: u64) -> f64 { - x as f64 - } + /* u64 operations */ // floatundisf pub fn aeabi_ul2f(x: u64) -> f32 { x as f32 } - // udivdi3 - pub fn aeabi_uldivmod(a: u64, b: u64) -> u64 { - a * b - } - - pub fn moddi3(a: i64, b: i64) -> i64 { - a % b + // floatundidf + pub fn aeabi_ul2d(x: u64) -> f64 { + x as f64 } - pub fn mulodi4(a: i64, b: i64) -> i64 { + // udivdi3 + pub fn aeabi_uldivmod(a: u64, b: u64) -> u64 { a * b } @@ -228,6 +372,8 @@ mod intrinsics { a % b } + /* u128 operations */ + pub fn muloti4(a: u128, b: u128) -> Option { a.checked_mul(b) } @@ -244,10 +390,6 @@ mod intrinsics { a << b } - pub fn lshrti3(a: i128, b: usize) 
-> i128 { - a >> b - } - pub fn udivti3(a: u128, b: u128) -> u128 { a / b } @@ -255,24 +397,15 @@ mod intrinsics { pub fn umodti3(a: u128, b: u128) -> u128 { a % b } - - pub fn divti3(a: i128, b: i128) -> i128 { - a / b - } - - pub fn modti3(a: i128, b: i128) -> i128 { - a % b - } - - pub fn udivsi3(a: u32, b: u32) -> u32 { - a / b - } } fn run() { use core::hint::black_box as bb; use intrinsics::*; + // FIXME(f16_f128): some PPC f128 <-> int conversion functions have the wrong names + + bb(addtf(bb(2.), bb(2.))); bb(aeabi_d2f(bb(2.))); bb(aeabi_d2i(bb(2.))); bb(aeabi_d2l(bb(2.))); @@ -312,19 +445,50 @@ fn run() { bb(aeabi_ul2d(bb(2))); bb(aeabi_ul2f(bb(2))); bb(aeabi_uldivmod(bb(2), bb(3))); + bb(ashlti3(bb(2), bb(2))); + bb(ashrti3(bb(2), bb(2))); + bb(divtf(bb(2.), bb(2.))); + bb(divti3(bb(2), bb(2))); + bb(eqtf(bb(2.), bb(2.))); + bb(extendhfdf(bb(2.))); + bb(extendhfsf(bb(2.))); + #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))] + bb(extendhftf(bb(2.))); + bb(extendsftf(bb(2.))); + bb(fixdfti(bb(2.))); + bb(fixsfti(bb(2.))); + #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))] + bb(fixtfdi(bb(2.))); + #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))] + bb(fixtfsi(bb(2.))); + #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))] + bb(fixtfti(bb(2.))); + bb(fixunsdfti(bb(2.))); + bb(fixunssfti(bb(2.))); + #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))] + bb(fixunstfdi(bb(2.))); + #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))] + bb(fixunstfsi(bb(2.))); + #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))] + bb(fixunstfti(bb(2.))); + bb(gttf(bb(2.), bb(2.))); + bb(lshrti3(bb(2), bb(2))); + bb(lttf(bb(2.), bb(2.))); bb(moddi3(bb(2), bb(3))); + bb(modti3(bb(2), bb(2))); bb(mulodi4(bb(2), bb(3))); - bb(umoddi3(bb(2), bb(3))); bb(muloti4(bb(2), bb(2))); + bb(multf(bb(2.), bb(2.))); bb(multi3(bb(2), bb(2))); - bb(ashlti3(bb(2), bb(2))); - bb(ashrti3(bb(2), bb(2))); - bb(lshrti3(bb(2), bb(2))); + bb(subtf(bb(2.), bb(2.))); + bb(truncsfhf(bb(2.))); + bb(trunctfdf(bb(2.))); + #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))] + bb(trunctfhf(bb(2.))); + bb(trunctfsf(bb(2.))); bb(udivti3(bb(2), bb(2))); + bb(umoddi3(bb(2), bb(3))); bb(umodti3(bb(2), bb(2))); - bb(divti3(bb(2), bb(2))); - bb(modti3(bb(2), bb(2))); - bb(udivsi3(bb(2), bb(2))); something_with_a_dtor(&|| assert_eq!(bb(1), 1)); diff --git a/libm b/libm index 1dbb9d2d..300edb32 160000 --- a/libm +++ b/libm @@ -1 +1 @@ -Subproject commit 1dbb9d2d476d65d020feca17b11391652038e2e1 +Subproject commit 300edb32520b1673e16d2411a0e2e6273959eb46 diff --git a/src/aarch64.rs b/src/aarch64.rs new file mode 100644 index 00000000..cce485c4 --- /dev/null +++ b/src/aarch64.rs @@ -0,0 +1,21 @@ +#![allow(unused_imports)] + +use core::intrinsics; + +intrinsics! { + #[naked] + #[cfg(all(target_os = "uefi", not(feature = "no-asm")))] + pub unsafe extern "C" fn __chkstk() { + core::arch::naked_asm!( + ".p2align 2", + "lsl x16, x15, #4", + "mov x17, sp", + "1:", + "sub x17, x17, 4096", + "subs x16, x16, 4096", + "ldr xzr, [x17]", + "b.gt 1b", + "ret", + ); + } +} diff --git a/src/aarch64_linux.rs b/src/aarch64_linux.rs index 62144e53..caac3e60 100644 --- a/src/aarch64_linux.rs +++ b/src/aarch64_linux.rs @@ -136,7 +136,7 @@ macro_rules! 
compare_and_swap { expected: int_ty!($bytes), desired: int_ty!($bytes), ptr: *mut int_ty!($bytes) ) -> int_ty!($bytes) { // We can't use `AtomicI8::compare_and_swap`; we *are* compare_and_swap. - unsafe { core::arch::asm! { + unsafe { core::arch::naked_asm! { // UXT s(tmp0), s(0) concat!(uxt!($bytes), " ", reg!($bytes, 16), ", ", reg!($bytes, 0)), "0:", @@ -150,7 +150,6 @@ macro_rules! compare_and_swap { "cbnz w17, 0b", "1:", "ret", - options(noreturn) } } } } @@ -166,7 +165,7 @@ macro_rules! compare_and_swap_i128 { pub unsafe extern "C" fn $name ( expected: i128, desired: i128, ptr: *mut i128 ) -> i128 { - unsafe { core::arch::asm! { + unsafe { core::arch::naked_asm! { "mov x16, x0", "mov x17, x1", "0:", @@ -180,7 +179,6 @@ macro_rules! compare_and_swap_i128 { "cbnz w15, 0b", "1:", "ret", - options(noreturn) } } } } @@ -196,7 +194,7 @@ macro_rules! swap { pub unsafe extern "C" fn $name ( left: int_ty!($bytes), right_ptr: *mut int_ty!($bytes) ) -> int_ty!($bytes) { - unsafe { core::arch::asm! { + unsafe { core::arch::naked_asm! { // mov s(tmp0), s(0) concat!("mov ", reg!($bytes, 16), ", ", reg!($bytes, 0)), "0:", @@ -206,7 +204,6 @@ macro_rules! swap { concat!(stxr!($ordering, $bytes), " w17, ", reg!($bytes, 16), ", [x1]"), "cbnz w17, 0b", "ret", - options(noreturn) } } } } @@ -222,7 +219,7 @@ macro_rules! fetch_op { pub unsafe extern "C" fn $name ( val: int_ty!($bytes), ptr: *mut int_ty!($bytes) ) -> int_ty!($bytes) { - unsafe { core::arch::asm! { + unsafe { core::arch::naked_asm! { // mov s(tmp0), s(0) concat!("mov ", reg!($bytes, 16), ", ", reg!($bytes, 0)), "0:", @@ -234,7 +231,6 @@ macro_rules! fetch_op { concat!(stxr!($ordering, $bytes), " w15, ", reg!($bytes, 17), ", [x1]"), "cbnz w15, 0b", "ret", - options(noreturn) } } } } diff --git a/src/arm.rs b/src/arm.rs index a062a54e..9e660839 100644 --- a/src/arm.rs +++ b/src/arm.rs @@ -3,14 +3,14 @@ use core::intrinsics; -// iOS symbols have a leading underscore. -#[cfg(target_os = "ios")] +// Apple symbols have a leading underscore. +#[cfg(target_vendor = "apple")] macro_rules! bl { ($func:literal) => { concat!("bl _", $func) }; } -#[cfg(not(target_os = "ios"))] +#[cfg(not(target_vendor = "apple"))] macro_rules! bl { ($func:literal) => { concat!("bl ", $func) @@ -20,11 +20,10 @@ macro_rules! bl { intrinsics! { // NOTE This function and the ones below are implemented using assembly because they are using a // custom calling convention which can't be implemented using a normal Rust function. - #[cfg_attr(all(not(windows), not(target_vendor="apple")), weak)] #[naked] #[cfg(not(target_env = "msvc"))] pub unsafe extern "C" fn __aeabi_uidivmod() { - core::arch::asm!( + core::arch::naked_asm!( "push {{lr}}", "sub sp, sp, #4", "mov r2, sp", @@ -32,14 +31,12 @@ intrinsics! { "ldr r1, [sp]", "add sp, sp, #4", "pop {{pc}}", - options(noreturn) ); } - #[cfg_attr(all(not(windows), not(target_vendor="apple")), weak)] #[naked] pub unsafe extern "C" fn __aeabi_uldivmod() { - core::arch::asm!( + core::arch::naked_asm!( "push {{r4, lr}}", "sub sp, sp, #16", "add r4, sp, #8", @@ -49,28 +46,24 @@ intrinsics! 
{ "ldr r3, [sp, #12]", "add sp, sp, #16", "pop {{r4, pc}}", - options(noreturn) ); } - #[cfg_attr(all(not(windows), not(target_vendor="apple")), weak)] #[naked] pub unsafe extern "C" fn __aeabi_idivmod() { - core::arch::asm!( + core::arch::naked_asm!( "push {{r0, r1, r4, lr}}", bl!("__aeabi_idiv"), "pop {{r1, r2}}", "muls r2, r2, r0", "subs r1, r1, r2", "pop {{r4, pc}}", - options(noreturn) ); } - #[cfg_attr(all(not(windows), not(target_vendor="apple")), weak)] #[naked] pub unsafe extern "C" fn __aeabi_ldivmod() { - core::arch::asm!( + core::arch::naked_asm!( "push {{r4, lr}}", "sub sp, sp, #16", "add r4, sp, #8", @@ -80,22 +73,17 @@ intrinsics! { "ldr r3, [sp, #12]", "add sp, sp, #16", "pop {{r4, pc}}", - options(noreturn) ); } - // The following functions use weak linkage to allow users to override - // with custom implementation. // FIXME: The `*4` and `*8` variants should be defined as aliases. - #[weak] - #[cfg(not(target_os = "ios"))] + #[cfg(not(target_vendor = "apple"))] pub unsafe extern "aapcs" fn __aeabi_memcpy(dest: *mut u8, src: *const u8, n: usize) { - ::mem::memcpy(dest, src, n); + crate::mem::memcpy(dest, src, n); } - #[weak] - #[cfg(not(target_os = "ios"))] + #[cfg(not(target_vendor = "apple"))] pub unsafe extern "aapcs" fn __aeabi_memcpy4(dest: *mut u8, src: *const u8, n: usize) { // We are guaranteed 4-alignment, so accessing at u32 is okay. let mut dest = dest as *mut u32; @@ -112,39 +100,33 @@ intrinsics! { __aeabi_memcpy(dest as *mut u8, src as *const u8, n); } - #[weak] - #[cfg(not(target_os = "ios"))] + #[cfg(not(target_vendor = "apple"))] pub unsafe extern "aapcs" fn __aeabi_memcpy8(dest: *mut u8, src: *const u8, n: usize) { __aeabi_memcpy4(dest, src, n); } - #[weak] - #[cfg(not(target_os = "ios"))] + #[cfg(not(target_vendor = "apple"))] pub unsafe extern "aapcs" fn __aeabi_memmove(dest: *mut u8, src: *const u8, n: usize) { - ::mem::memmove(dest, src, n); + crate::mem::memmove(dest, src, n); } - #[weak] - #[cfg(not(any(target_os = "ios", target_env = "msvc")))] + #[cfg(not(any(target_vendor = "apple", target_env = "msvc")))] pub unsafe extern "aapcs" fn __aeabi_memmove4(dest: *mut u8, src: *const u8, n: usize) { __aeabi_memmove(dest, src, n); } - #[weak] - #[cfg(not(any(target_os = "ios", target_env = "msvc")))] + #[cfg(not(any(target_vendor = "apple", target_env = "msvc")))] pub unsafe extern "aapcs" fn __aeabi_memmove8(dest: *mut u8, src: *const u8, n: usize) { __aeabi_memmove(dest, src, n); } - #[weak] - #[cfg(not(target_os = "ios"))] + #[cfg(not(target_vendor = "apple"))] pub unsafe extern "aapcs" fn __aeabi_memset(dest: *mut u8, n: usize, c: i32) { // Note the different argument order - ::mem::memset(dest, c, n); + crate::mem::memset(dest, c, n); } - #[weak] - #[cfg(not(target_os = "ios"))] + #[cfg(not(target_vendor = "apple"))] pub unsafe extern "aapcs" fn __aeabi_memset4(dest: *mut u8, n: usize, c: i32) { let mut dest = dest as *mut u32; let mut n = n; @@ -161,26 +143,22 @@ intrinsics! 
{ __aeabi_memset(dest as *mut u8, n, byte as i32); } - #[weak] - #[cfg(not(target_os = "ios"))] + #[cfg(not(target_vendor = "apple"))] pub unsafe extern "aapcs" fn __aeabi_memset8(dest: *mut u8, n: usize, c: i32) { __aeabi_memset4(dest, n, c); } - #[weak] - #[cfg(not(target_os = "ios"))] + #[cfg(not(target_vendor = "apple"))] pub unsafe extern "aapcs" fn __aeabi_memclr(dest: *mut u8, n: usize) { __aeabi_memset(dest, n, 0); } - #[weak] - #[cfg(not(any(target_os = "ios", target_env = "msvc")))] + #[cfg(not(any(target_vendor = "apple", target_env = "msvc")))] pub unsafe extern "aapcs" fn __aeabi_memclr4(dest: *mut u8, n: usize) { __aeabi_memset4(dest, n, 0); } - #[weak] - #[cfg(not(any(target_os = "ios", target_env = "msvc")))] + #[cfg(not(any(target_vendor = "apple", target_env = "msvc")))] pub unsafe extern "aapcs" fn __aeabi_memclr8(dest: *mut u8, n: usize) { __aeabi_memset4(dest, n, 0); } diff --git a/src/float/add.rs b/src/float/add.rs index 67f6c2c1..bceef7b0 100644 --- a/src/float/add.rs +++ b/src/float/add.rs @@ -1,5 +1,5 @@ -use float::Float; -use int::{CastInto, Int}; +use crate::float::Float; +use crate::int::{CastInto, Int, MinInt}; /// Returns `a + b` fn add(a: F, b: F) -> F @@ -57,9 +57,9 @@ where } // zero + anything = anything - if a_abs == Int::ZERO { + if a_abs == MinInt::ZERO { // but we need to get the sign right for zero + zero - if b_abs == Int::ZERO { + if b_abs == MinInt::ZERO { return F::from_repr(a.repr() & b.repr()); } else { return b; @@ -67,7 +67,7 @@ where } // anything + zero = anything - if b_abs == Int::ZERO { + if b_abs == MinInt::ZERO { return a; } } @@ -113,10 +113,10 @@ where // Shift the significand of b by the difference in exponents, with a sticky // bottom bit to get rounding correct. let align = a_exponent.wrapping_sub(b_exponent).cast(); - if align != Int::ZERO { + if align != MinInt::ZERO { if align < bits { let sticky = - F::Int::from_bool(b_significand << bits.wrapping_sub(align).cast() != Int::ZERO); + F::Int::from_bool(b_significand << bits.wrapping_sub(align).cast() != MinInt::ZERO); b_significand = (b_significand >> align.cast()) | sticky; } else { b_significand = one; // sticky; b is known to be non-zero. @@ -125,8 +125,8 @@ where if subtraction { a_significand = a_significand.wrapping_sub(b_significand); // If a == -b, return +zero. - if a_significand == Int::ZERO { - return F::from_repr(Int::ZERO); + if a_significand == MinInt::ZERO { + return F::from_repr(MinInt::ZERO); } // If partial cancellation occured, we need to left-shift the result @@ -143,8 +143,8 @@ where // If the addition carried up, we need to right-shift the result and // adjust the exponent: - if a_significand & implicit_bit << 4 != Int::ZERO { - let sticky = F::Int::from_bool(a_significand & one != Int::ZERO); + if a_significand & implicit_bit << 4 != MinInt::ZERO { + let sticky = F::Int::from_bool(a_significand & one != MinInt::ZERO); a_significand = a_significand >> 1 | sticky; a_exponent += 1; } @@ -160,7 +160,7 @@ where // need to shift the significand. let shift = (1 - a_exponent).cast(); let sticky = - F::Int::from_bool((a_significand << bits.wrapping_sub(shift).cast()) != Int::ZERO); + F::Int::from_bool((a_significand << bits.wrapping_sub(shift).cast()) != MinInt::ZERO); a_significand = a_significand >> shift.cast() | sticky; a_exponent = 0; } @@ -189,25 +189,23 @@ where } intrinsics! 
{ + #[avr_skip] #[aapcs_on_arm] #[arm_aeabi_alias = __aeabi_fadd] pub extern "C" fn __addsf3(a: f32, b: f32) -> f32 { add(a, b) } + #[avr_skip] #[aapcs_on_arm] #[arm_aeabi_alias = __aeabi_dadd] pub extern "C" fn __adddf3(a: f64, b: f64) -> f64 { add(a, b) } - #[cfg(target_arch = "arm")] - pub extern "C" fn __addsf3vfp(a: f32, b: f32) -> f32 { - a + b - } - - #[cfg(target_arch = "arm")] - pub extern "C" fn __adddf3vfp(a: f64, b: f64) -> f64 { - a + b + #[ppc_alias = __addkf3] + #[cfg(f128_enabled)] + pub extern "C" fn __addtf3(a: f128, b: f128) -> f128 { + add(a, b) } } diff --git a/src/float/cmp.rs b/src/float/cmp.rs index 1bd7aa28..bb7d4b49 100644 --- a/src/float/cmp.rs +++ b/src/float/cmp.rs @@ -1,7 +1,7 @@ #![allow(unreachable_code)] -use float::Float; -use int::Int; +use crate::float::Float; +use crate::int::MinInt; #[derive(Clone, Copy)] enum Result { @@ -172,6 +172,51 @@ intrinsics! { } } +#[cfg(f128_enabled)] +intrinsics! { + #[avr_skip] + #[ppc_alias = __lekf2] + pub extern "C" fn __letf2(a: f128, b: f128) -> i32 { + cmp(a, b).to_le_abi() + } + + #[avr_skip] + #[ppc_alias = __gekf2] + pub extern "C" fn __getf2(a: f128, b: f128) -> i32 { + cmp(a, b).to_ge_abi() + } + + #[avr_skip] + #[ppc_alias = __unordkf2] + pub extern "C" fn __unordtf2(a: f128, b: f128) -> i32 { + unord(a, b) as i32 + } + + #[avr_skip] + #[ppc_alias = __eqkf2] + pub extern "C" fn __eqtf2(a: f128, b: f128) -> i32 { + cmp(a, b).to_le_abi() + } + + #[avr_skip] + #[ppc_alias = __ltkf2] + pub extern "C" fn __lttf2(a: f128, b: f128) -> i32 { + cmp(a, b).to_le_abi() + } + + #[avr_skip] + #[ppc_alias = __nekf2] + pub extern "C" fn __netf2(a: f128, b: f128) -> i32 { + cmp(a, b).to_le_abi() + } + + #[avr_skip] + #[ppc_alias = __gtkf2] + pub extern "C" fn __gttf2(a: f128, b: f128) -> i32 { + cmp(a, b).to_ge_abi() + } +} + #[cfg(target_arch = "arm")] intrinsics! { pub extern "aapcs" fn __aeabi_fcmple(a: f32, b: f32) -> i32 { @@ -213,55 +258,4 @@ intrinsics! { pub extern "aapcs" fn __aeabi_dcmpgt(a: f64, b: f64) -> i32 { (__gtdf2(a, b) > 0) as i32 } - - // On hard-float targets LLVM will use native instructions - // for all VFP intrinsics below - - pub extern "C" fn __gesf2vfp(a: f32, b: f32) -> i32 { - (a >= b) as i32 - } - - pub extern "C" fn __gedf2vfp(a: f64, b: f64) -> i32 { - (a >= b) as i32 - } - - pub extern "C" fn __gtsf2vfp(a: f32, b: f32) -> i32 { - (a > b) as i32 - } - - pub extern "C" fn __gtdf2vfp(a: f64, b: f64) -> i32 { - (a > b) as i32 - } - - pub extern "C" fn __ltsf2vfp(a: f32, b: f32) -> i32 { - (a < b) as i32 - } - - pub extern "C" fn __ltdf2vfp(a: f64, b: f64) -> i32 { - (a < b) as i32 - } - - pub extern "C" fn __lesf2vfp(a: f32, b: f32) -> i32 { - (a <= b) as i32 - } - - pub extern "C" fn __ledf2vfp(a: f64, b: f64) -> i32 { - (a <= b) as i32 - } - - pub extern "C" fn __nesf2vfp(a: f32, b: f32) -> i32 { - (a != b) as i32 - } - - pub extern "C" fn __nedf2vfp(a: f64, b: f64) -> i32 { - (a != b) as i32 - } - - pub extern "C" fn __eqsf2vfp(a: f32, b: f32) -> i32 { - (a == b) as i32 - } - - pub extern "C" fn __eqdf2vfp(a: f64, b: f64) -> i32 { - (a == b) as i32 - } } diff --git a/src/float/conv.rs b/src/float/conv.rs index 790c0ab9..d275f982 100644 --- a/src/float/conv.rs +++ b/src/float/conv.rs @@ -1,3 +1,9 @@ +use core::ops::Neg; + +use crate::int::{CastFrom, CastInto, Int, MinInt}; + +use super::Float; + /// Conversions from integers to floats. /// /// These are hand-optimized bit twiddling code, @@ -142,102 +148,136 @@ intrinsics! { } } +/// Generic float to unsigned int conversions. 
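+///
+/// (Note on the contract, inferred from `float_to_int_inner` below: NaN and negative
+/// inputs return zero, and values too large to represent, including infinity, saturate
+/// to `U::MAX`.)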
+fn float_to_unsigned_int<F, U>(f: F) -> U
+where
+    F: Float,
+    U: Int<UnsignedInt = U>,
+    F::Int: CastInto<U>,
+    F::Int: CastFrom<u32>,
+    F::Int: CastInto<U::UnsignedInt>,
+    u32: CastFrom<F::Int>,
+{
+    float_to_int_inner::<F, U, _, _>(f.repr(), |i: U| i, || U::MAX)
+}
+
+/// Generic float to signed int conversions.
+fn float_to_signed_int<F, I>(f: F) -> I
+where
+    F: Float,
+    I: Int + Neg<Output = I>,
+    I::UnsignedInt: Int,
+    F::Int: CastInto<I::UnsignedInt>,
+    F::Int: CastFrom<u32>,
+    u32: CastFrom<F::Int>,
+{
+    float_to_int_inner::<F, I, _, _>(
+        f.repr() & !F::SIGN_MASK,
+        |i: I| if f.is_sign_negative() { -i } else { i },
+        || if f.is_sign_negative() { I::MIN } else { I::MAX },
+    )
+}
+
+/// Float to int conversions, generic for both signed and unsigned.
+///
+/// Parameters:
+/// - `fbits`: `abs(f)` bitcasted to an integer.
+/// - `map_inbounds`: apply this transformation to integers that are within range (add the sign
+///    back).
+/// - `out_of_bounds`: return value when out of range for `I`.
+fn float_to_int_inner<F, I, FnFoo, FnOob>(
+    fbits: F::Int,
+    map_inbounds: FnFoo,
+    out_of_bounds: FnOob,
+) -> I
+where
+    F: Float,
+    I: Int,
+    FnFoo: FnOnce(I) -> I,
+    FnOob: FnOnce() -> I,
+    I::UnsignedInt: Int,
+    F::Int: CastInto<I::UnsignedInt>,
+    F::Int: CastFrom<u32>,
+    u32: CastFrom<F::Int>,
+{
+    let int_max_exp = F::EXPONENT_BIAS + I::MAX.ilog2() + 1;
+    let foobar = F::EXPONENT_BIAS + I::UnsignedInt::BITS - 1;
+
+    if fbits < F::ONE.repr() {
+        // `abs(f)` < 1 rounds to 0
+        I::ZERO
+    } else if fbits < F::Int::cast_from(int_max_exp) << F::SIGNIFICAND_BITS {
+        // >= 1, < integer max
+        let m_base = if I::UnsignedInt::BITS >= F::Int::BITS {
+            I::UnsignedInt::cast_from(fbits) << (I::BITS - F::SIGNIFICAND_BITS - 1)
+        } else {
+            I::UnsignedInt::cast_from(fbits >> (F::SIGNIFICAND_BITS - I::BITS + 1))
+        };
+
+        // Set the implicit 1-bit.
+        let m: I::UnsignedInt = I::UnsignedInt::ONE << (I::BITS - 1) | m_base;
+
+        // Shift based on the exponent and bias.
+        let s: u32 = (foobar) - u32::cast_from(fbits >> F::SIGNIFICAND_BITS);
+
+        let unsigned = m >> s;
+        map_inbounds(I::from_unsigned(unsigned))
+    } else if fbits <= F::EXPONENT_MASK {
+        // >= max (incl. inf)
+        out_of_bounds()
+    } else {
+        I::ZERO
+    }
+}
+
 // Conversions from floats to unsigned integers.
 intrinsics! {
     #[arm_aeabi_alias = __aeabi_f2uiz]
     pub extern "C" fn __fixunssfsi(f: f32) -> u32 {
-        let fbits = f.to_bits();
-        if fbits < 127 << 23 { // >= 0, < 1
-            0
-        } else if fbits < 159 << 23 { // >= 1, < max
-            let m = 1 << 31 | fbits << 8; // Mantissa and the implicit 1-bit.
-            let s = 158 - (fbits >> 23); // Shift based on the exponent and bias.
-            m >> s
-        } else if fbits <= 255 << 23 { // >= max (incl. inf)
-            u32::MAX
-        } else { // Negative or NaN
-            0
-        }
+        float_to_unsigned_int(f)
     }
 
     #[arm_aeabi_alias = __aeabi_f2ulz]
     pub extern "C" fn __fixunssfdi(f: f32) -> u64 {
-        let fbits = f.to_bits();
-        if fbits < 127 << 23 { // >= 0, < 1
-            0
-        } else if fbits < 191 << 23 { // >= 1, < max
-            let m = 1 << 63 | (fbits as u64) << 40; // Mantissa and the implicit 1-bit.
-            let s = 190 - (fbits >> 23); // Shift based on the exponent and bias.
-            m >> s
-        } else if fbits <= 255 << 23 { // >= max (incl. inf)
-            u64::MAX
-        } else { // Negative or NaN
-            0
-        }
+        float_to_unsigned_int(f)
     }
 
     #[win64_128bit_abi_hack]
     pub extern "C" fn __fixunssfti(f: f32) -> u128 {
-        let fbits = f.to_bits();
-        if fbits < 127 << 23 { // >= 0, < 1
-            0
-        } else if fbits < 255 << 23 { // >= 1, < inf
-            let m = 1 << 127 | (fbits as u128) << 104; // Mantissa and the implicit 1-bit.
-            let s = 254 - (fbits >> 23); // Shift based on the exponent and bias.
- m >> s - } else if fbits == 255 << 23 { // == inf - u128::MAX - } else { // Negative or NaN - 0 - } + float_to_unsigned_int(f) } #[arm_aeabi_alias = __aeabi_d2uiz] pub extern "C" fn __fixunsdfsi(f: f64) -> u32 { - let fbits = f.to_bits(); - if fbits < 1023 << 52 { // >= 0, < 1 - 0 - } else if fbits < 1055 << 52 { // >= 1, < max - let m = 1 << 31 | (fbits >> 21) as u32; // Mantissa and the implicit 1-bit. - let s = 1054 - (fbits >> 52); // Shift based on the exponent and bias. - m >> s - } else if fbits <= 2047 << 52 { // >= max (incl. inf) - u32::MAX - } else { // Negative or NaN - 0 - } + float_to_unsigned_int(f) } #[arm_aeabi_alias = __aeabi_d2ulz] pub extern "C" fn __fixunsdfdi(f: f64) -> u64 { - let fbits = f.to_bits(); - if fbits < 1023 << 52 { // >= 0, < 1 - 0 - } else if fbits < 1087 << 52 { // >= 1, < max - let m = 1 << 63 | fbits << 11; // Mantissa and the implicit 1-bit. - let s = 1086 - (fbits >> 52); // Shift based on the exponent and bias. - m >> s - } else if fbits <= 2047 << 52 { // >= max (incl. inf) - u64::MAX - } else { // Negative or NaN - 0 - } + float_to_unsigned_int(f) } #[win64_128bit_abi_hack] pub extern "C" fn __fixunsdfti(f: f64) -> u128 { - let fbits = f.to_bits(); - if fbits < 1023 << 52 { // >= 0, < 1 - 0 - } else if fbits < 1151 << 52 { // >= 1, < max - let m = 1 << 127 | (fbits as u128) << 75; // Mantissa and the implicit 1-bit. - let s = 1150 - (fbits >> 52); // Shift based on the exponent and bias. - m >> s - } else if fbits <= 2047 << 52 { // >= max (incl. inf) - u128::MAX - } else { // Negative or NaN - 0 - } + float_to_unsigned_int(f) + } + + #[ppc_alias = __fixunskfsi] + #[cfg(f128_enabled)] + pub extern "C" fn __fixunstfsi(f: f128) -> u32 { + float_to_unsigned_int(f) + } + + #[ppc_alias = __fixunskfdi] + #[cfg(f128_enabled)] + pub extern "C" fn __fixunstfdi(f: f128) -> u64 { + float_to_unsigned_int(f) + } + + #[ppc_alias = __fixunskfti] + #[cfg(f128_enabled)] + pub extern "C" fn __fixunstfti(f: f128) -> u128 { + float_to_unsigned_int(f) } } @@ -245,103 +285,49 @@ intrinsics! { intrinsics! { #[arm_aeabi_alias = __aeabi_f2iz] pub extern "C" fn __fixsfsi(f: f32) -> i32 { - let fbits = f.to_bits() & !0 >> 1; // Remove sign bit. - if fbits < 127 << 23 { // >= 0, < 1 - 0 - } else if fbits < 158 << 23 { // >= 1, < max - let m = 1 << 31 | fbits << 8; // Mantissa and the implicit 1-bit. - let s = 158 - (fbits >> 23); // Shift based on the exponent and bias. - let u = (m >> s) as i32; // Unsigned result. - if f.is_sign_negative() { -u } else { u } - } else if fbits <= 255 << 23 { // >= max (incl. inf) - if f.is_sign_negative() { i32::MIN } else { i32::MAX } - } else { // NaN - 0 - } + float_to_signed_int(f) } #[arm_aeabi_alias = __aeabi_f2lz] pub extern "C" fn __fixsfdi(f: f32) -> i64 { - let fbits = f.to_bits() & !0 >> 1; // Remove sign bit. - if fbits < 127 << 23 { // >= 0, < 1 - 0 - } else if fbits < 190 << 23 { // >= 1, < max - let m = 1 << 63 | (fbits as u64) << 40; // Mantissa and the implicit 1-bit. - let s = 190 - (fbits >> 23); // Shift based on the exponent and bias. - let u = (m >> s) as i64; // Unsigned result. - if f.is_sign_negative() { -u } else { u } - } else if fbits <= 255 << 23 { // >= max (incl. inf) - if f.is_sign_negative() { i64::MIN } else { i64::MAX } - } else { // NaN - 0 - } + float_to_signed_int(f) } #[win64_128bit_abi_hack] pub extern "C" fn __fixsfti(f: f32) -> i128 { - let fbits = f.to_bits() & !0 >> 1; // Remove sign bit. 
- if fbits < 127 << 23 { // >= 0, < 1 - 0 - } else if fbits < 254 << 23 { // >= 1, < max - let m = 1 << 127 | (fbits as u128) << 104; // Mantissa and the implicit 1-bit. - let s = 254 - (fbits >> 23); // Shift based on the exponent and bias. - let u = (m >> s) as i128; // Unsigned result. - if f.is_sign_negative() { -u } else { u } - } else if fbits <= 255 << 23 { // >= max (incl. inf) - if f.is_sign_negative() { i128::MIN } else { i128::MAX } - } else { // NaN - 0 - } + float_to_signed_int(f) } #[arm_aeabi_alias = __aeabi_d2iz] pub extern "C" fn __fixdfsi(f: f64) -> i32 { - let fbits = f.to_bits() & !0 >> 1; // Remove sign bit. - if fbits < 1023 << 52 { // >= 0, < 1 - 0 - } else if fbits < 1054 << 52 { // >= 1, < max - let m = 1 << 31 | (fbits >> 21) as u32; // Mantissa and the implicit 1-bit. - let s = 1054 - (fbits >> 52); // Shift based on the exponent and bias. - let u = (m >> s) as i32; // Unsigned result. - if f.is_sign_negative() { -u } else { u } - } else if fbits <= 2047 << 52 { // >= max (incl. inf) - if f.is_sign_negative() { i32::MIN } else { i32::MAX } - } else { // NaN - 0 - } + float_to_signed_int(f) } #[arm_aeabi_alias = __aeabi_d2lz] pub extern "C" fn __fixdfdi(f: f64) -> i64 { - let fbits = f.to_bits() & !0 >> 1; // Remove sign bit. - if fbits < 1023 << 52 { // >= 0, < 1 - 0 - } else if fbits < 1086 << 52 { // >= 1, < max - let m = 1 << 63 | fbits << 11; // Mantissa and the implicit 1-bit. - let s = 1086 - (fbits >> 52); // Shift based on the exponent and bias. - let u = (m >> s) as i64; // Unsigned result. - if f.is_sign_negative() { -u } else { u } - } else if fbits <= 2047 << 52 { // >= max (incl. inf) - if f.is_sign_negative() { i64::MIN } else { i64::MAX } - } else { // NaN - 0 - } + float_to_signed_int(f) } #[win64_128bit_abi_hack] pub extern "C" fn __fixdfti(f: f64) -> i128 { - let fbits = f.to_bits() & !0 >> 1; // Remove sign bit. - if fbits < 1023 << 52 { // >= 0, < 1 - 0 - } else if fbits < 1150 << 52 { // >= 1, < max - let m = 1 << 127 | (fbits as u128) << 75; // Mantissa and the implicit 1-bit. - let s = 1150 - (fbits >> 52); // Shift based on the exponent and bias. - let u = (m >> s) as i128; // Unsigned result. - if f.is_sign_negative() { -u } else { u } - } else if fbits <= 2047 << 52 { // >= max (incl. inf) - if f.is_sign_negative() { i128::MIN } else { i128::MAX } - } else { // NaN - 0 - } + float_to_signed_int(f) + } + + #[ppc_alias = __fixkfsi] + #[cfg(f128_enabled)] + pub extern "C" fn __fixtfsi(f: f128) -> i32 { + float_to_signed_int(f) + } + + #[ppc_alias = __fixkfdi] + #[cfg(f128_enabled)] + pub extern "C" fn __fixtfdi(f: f128) -> i64 { + float_to_signed_int(f) + } + + #[ppc_alias = __fixkfti] + #[cfg(f128_enabled)] + pub extern "C" fn __fixtfti(f: f128) -> i128 { + float_to_signed_int(f) } } diff --git a/src/float/div.rs b/src/float/div.rs index c0aae34f..f125771a 100644 --- a/src/float/div.rs +++ b/src/float/div.rs @@ -1,61 +1,149 @@ -// The functions are complex with many branches, and explicit -// `return`s makes it clear where function exit points are -#![allow(clippy::needless_return)] - -use float::Float; -use int::{CastInto, DInt, HInt, Int}; - -fn div32(a: F, b: F) -> F +//! Floating point division routines. +//! +//! This module documentation gives an overview of the method used. More documentation is inline. +//! +//! # Relevant notation +//! +//! - `m_a`: the mantissa of `a`, in base 2 +//! - `p_a`: the exponent of `a`, in base 2. I.e. `a = m_a * 2^p_a` +//! - `uqN` (e.g. `uq1`): this refers to Q notation for fixed-point numbers. 
UQ1.31 is an unsigned
+//!   fixed-point number with 1 integral bit, and 31 fractional bits. A `uqN` variable of type
+//!   `uM` will have N bits of integer and M-N bits of fraction.
+//! - `hw`: half width, i.e. for `f64` this will be a `u32`.
+//! - `x` is the best estimate of `1/m_b`
+//!
+//! # Method Overview
+//!
+//! Division routines must solve for `a / b`, which is `res = m_a*2^p_a / m_b*2^p_b`. The basic
+//! process is as follows:
+//!
+//! - Rearrange the exponent and significand to simplify the operations:
+//!   `res = (m_a / m_b) * 2^{p_a - p_b}`.
+//! - Check for early exits (infinity, zero, etc.).
+//! - If `a` or `b` are subnormal, normalize by shifting the mantissa and adjusting the exponent.
+//! - Set the implicit bit so math is correct.
+//! - Shift mantissa significant digits (with implicit bit) fully left such that fixed-point UQ1
+//!   or UQ0 numbers can be used for mantissa math. These will have greater precision than the
+//!   actual mantissa, which is important for correct rounding.
+//! - Calculate the reciprocal of `m_b`, `x`.
+//! - Use the reciprocal to multiply rather than divide: `res = m_a * x_b * 2^{p_a - p_b}`.
+//! - Reapply rounding.
+//!
+//! # Reciprocal calculation
+//!
+//! Calculating the reciprocal is the most complicated part of this process. It uses the
+//! [Newton-Raphson method], which picks an initial estimate (of the reciprocal) and performs
+//! a number of iterations to increase its precision.
+//!
+//! In general, Newton's method takes the following form:
+//!
+//! ```text
+//! `x_n` is a guess or the result of a previous iteration. Increasing `n` converges to the
+//! desired result.
+//!
+//! The result approaches a zero of `f(x)` by applying a correction to the previous guess.
+//!
+//! x_{n+1} = x_n - f(x_n) / f'(x_n)
+//! ```
+//!
+//! Applying this to find the reciprocal:
+//!
+//! ```text
+//! 1 / x = b
+//!
+//! Rearrange so we can solve by finding a zero
+//! 0 = (1 / x) - b = f(x)
+//!
+//! f'(x) = -x^{-2}
+//!
+//! x_{n+1} = x_n - f(x_n) / f'(x_n)
+//!         = x_n + (x_n^{-1} - b) * x_n^2
+//!         = 2*x_n - b*x_n^2
+//! ```
+//!
+//! This is a process that can be repeated to calculate the reciprocal with enough precision to
+//! achieve a correctly rounded result for the overall division operation. The maximum required
+//! number of iterations is known since precision doubles with each iteration.
+//!
+//! # Half-width operations
+//!
+//! Calculating the reciprocal requires widening multiplication and performing arithmetic on the
+//! results, meaning that emulated integer arithmetic on `u128` (for `f64`) and `u256` (for
+//! `f128`) gets used instead of native math.
+//!
+//! To make this more efficient, all but the final operation can be computed using half-width
+//! integers. For example, rather than computing four iterations using 128-bit integers for
+//! `f64`, we can instead perform three iterations using native 64-bit integers and only one
+//! final iteration using the full 128 bits.
+//!
+//! This works because of precision doubling. Some leeway is allowed here because the fixed-point
+//! number has more bits than the final mantissa will.
+//!
[Newton-Raphson method]: https://en.wikipedia.org/wiki/Newton%27s_method
+
+use super::HalfRep;
+use crate::float::Float;
+use crate::int::{CastFrom, CastInto, DInt, HInt, Int, MinInt};
+use core::mem::size_of;
+use core::ops;
+
+fn div<F: Float>(a: F, b: F) -> F
 where
-    u32: CastInto<F::Int>,
-    F::Int: CastInto<u32>,
-    i32: CastInto<F::Int>,
     F::Int: CastInto<i32>,
-    F::Int: HInt,
-    <F as Float>::Int: core::ops::Mul,
+    F::Int: From<HalfRep<F>>,
+    F::Int: From<u8>,
+    F::Int: HInt + DInt,
+    <F::Int as HInt>::D: ops::Shr<u32, Output = <F::Int as HInt>::D>,
+    F::Int: From<u32>,
+    u16: CastInto<F::Int>,
+    i32: CastInto<F::Int>,
+    u32: CastInto<F::Int>,
+    u128: CastInto<HalfRep<F>>,
 {
-    const NUMBER_OF_HALF_ITERATIONS: usize = 0;
-    const NUMBER_OF_FULL_ITERATIONS: usize = 3;
-    const USE_NATIVE_FULL_ITERATIONS: bool = true;
-
     let one = F::Int::ONE;
     let zero = F::Int::ZERO;
+    let one_hw = HalfRep::<F>::ONE;
+    let zero_hw = HalfRep::<F>::ZERO;
     let hw = F::BITS / 2;
-    let lo_mask = u32::MAX >> hw;
+    let lo_mask = F::Int::MAX >> hw;
 
     let significand_bits = F::SIGNIFICAND_BITS;
-    let max_exponent = F::EXPONENT_MAX;
+    // Saturated exponent, representing infinity
+    let exponent_sat: F::Int = F::EXPONENT_MAX.cast();
 
     let exponent_bias = F::EXPONENT_BIAS;
     let implicit_bit = F::IMPLICIT_BIT;
     let significand_mask = F::SIGNIFICAND_MASK;
-    let sign_bit = F::SIGN_MASK as F::Int;
+    let sign_bit = F::SIGN_MASK;
     let abs_mask = sign_bit - one;
     let exponent_mask = F::EXPONENT_MASK;
     let inf_rep = exponent_mask;
     let quiet_bit = implicit_bit >> 1;
     let qnan_rep = exponent_mask | quiet_bit;
+    let (mut half_iterations, full_iterations) = get_iterations::<F>();
+    let recip_precision = reciprocal_precision::<F>();
 
-    #[inline(always)]
-    fn negate_u32(a: u32) -> u32 {
-        (<i32>::wrapping_neg(a as i32)) as u32
+    if F::BITS == 128 {
+        // FIXME(tgross35): f128 seems to require one more half iteration than expected
+        half_iterations += 1;
     }
 
     let a_rep = a.repr();
     let b_rep = b.repr();
-    let a_exponent = (a_rep >> significand_bits) & max_exponent.cast();
-    let b_exponent = (b_rep >> significand_bits) & max_exponent.cast();
+
+    // Exponent numeric representation, not accounting for bias
+    let a_exponent = (a_rep >> significand_bits) & exponent_sat;
+    let b_exponent = (b_rep >> significand_bits) & exponent_sat;
     let quotient_sign = (a_rep ^ b_rep) & sign_bit;
 
     let mut a_significand = a_rep & significand_mask;
     let mut b_significand = b_rep & significand_mask;
-    let mut scale = 0;
+
+    // The exponent of our final result in its encoded form
+    let mut res_exponent: i32 =
+        i32::cast_from(a_exponent) - i32::cast_from(b_exponent) + (exponent_bias as i32);
 
     // Detect if a or b is zero, denormal, infinity, or NaN.
-    if a_exponent.wrapping_sub(one) >= (max_exponent - 1).cast()
-        || b_exponent.wrapping_sub(one) >= (max_exponent - 1).cast()
+    if a_exponent.wrapping_sub(one) >= (exponent_sat - one)
+        || b_exponent.wrapping_sub(one) >= (exponent_sat - one)
     {
         let a_abs = a_rep & abs_mask;
         let b_abs = b_rep & abs_mask;
@@ -64,6 +152,7 @@ where
         if a_abs > inf_rep {
             return F::from_repr(a_rep | quiet_bit);
         }
+
        // anything / NaN = qNaN
        if b_abs > inf_rep {
            return F::from_repr(b_rep | quiet_bit);
@@ -99,34 +188,31 @@ where
             return F::from_repr(inf_rep | quotient_sign);
         }
 
-        // one or both of a or b is denormal, the other (if applicable) is a
-        // normal number. Renormalize one or both of a and b, and set scale to
-        // include the necessary exponent adjustment.
+        // a is denormal. Renormalize it and set the scale to include the necessary exponent
+        // adjustment.
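+        // (Illustrative aside, not part of the patch: `F::normalize` is assumed to shift
+        // the significand left until the implicit bit is set and to return the matching
+        // exponent correction, which is added to `res_exponent` for `a` and subtracted
+        // for `b` below, keeping each operand's value unchanged.)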
        if a_abs < implicit_bit {
             let (exponent, significand) = F::normalize(a_significand);
-            scale += exponent;
+            res_exponent += exponent;
             a_significand = significand;
         }
 
+        // b is denormal. Renormalize it and set the scale to include the necessary exponent
+        // adjustment.
         if b_abs < implicit_bit {
             let (exponent, significand) = F::normalize(b_significand);
-            scale -= exponent;
+            res_exponent -= exponent;
             b_significand = significand;
         }
     }
 
-    // Set the implicit significand bit. If we fell through from the
+    // Set the implicit significand bit. If we fell through from the
     // denormal path it was already set by normalize( ), but setting it twice
     // won't hurt anything.
     a_significand |= implicit_bit;
     b_significand |= implicit_bit;
 
-    let written_exponent: i32 = CastInto::<u32>::cast(
-        a_exponent
-            .wrapping_sub(b_exponent)
-            .wrapping_add(scale.cast()),
-    )
-    .wrapping_add(exponent_bias) as i32;
+    // Transform to a fixed-point representation by shifting the significand to the high bits. We
+    // know this is in the range [1.0, 2.0] since the implicit bit is set to 1 above.
     let b_uq1 = b_significand << (F::BITS - significand_bits - 1);
 
     // Align the significand of b as a UQ1.(n-1) fixed-point number in the range
@@ -136,7 +222,7 @@ where
     // abs(x0(b) - 1/b) <= abs(x0(1) - 1/1) = 3/4 - 1/sqrt(2) = 0.04289...,
     // which is about 4.5 bits.
     // The initial approximation is between x0(1.0) = 0.9571... and x0(2.0) = 0.4571...
-
+    //
     // Then, refine the reciprocal estimate using a quadratically converging
     // Newton-Raphson iteration:
     //     x_{n+1} = x_n * (2 - x_n * b)
@@ -156,118 +242,116 @@ where
    // abs(E_n) <= abs(e_n) + (1/b_hw - 1/b)
    //           = abs(e_n) + (b - b_hw) / (b*b_hw)
    //           <= abs(e_n) + 2 * 2^-HW
-
+    //
    // rep_t-sized iterations may be slower than the corresponding half-width
    // variant depending on the hardware and whether single/double/quad precision
    // is selected.
+    //
    // NB: Using half-width iterations increases computation errors due to
    // rounding, so error estimations have to be computed taking the selected
    // mode into account!
-
-    #[allow(clippy::absurd_extreme_comparisons)]
-    let mut x_uq0 = if NUMBER_OF_HALF_ITERATIONS > 0 {
+    let mut x_uq0 = if half_iterations > 0 {
         // Starting with (n-1) half-width iterations
-        let b_uq1_hw: u16 =
-            (CastInto::<u32>::cast(b_significand) >> (significand_bits + 1 - hw)) as u16;
+        let b_uq1_hw: HalfRep<F> = b_uq1.hi();
 
         // C is (3/4 + 1/sqrt(2)) - 1 truncated to W0 fractional bits as UQ0.HW
         // with W0 being either 16 or 32 and W0 <= HW.
         // That is, C is the aforementioned 3/4 + 1/sqrt(2) constant (from which
         // b/2 is subtracted to obtain x0) wrapped to [0, 1) range.
+        let c_hw = c_hw::<F>();
 
-        // HW is at least 32. Shifting into the highest bits if needed.
-        let c_hw = (0x7504_u32 as u16).wrapping_shl(hw.wrapping_sub(32));
+        // Check that the top bit is set, i.e. value is within `[1, 2)`.
+        debug_assert!(b_uq1_hw & one_hw << (HalfRep::<F>::BITS - 1) > zero_hw);
 
         // b >= 1, thus an upper bound for 3/4 + 1/sqrt(2) - b/2 is about 0.9572,
         // so x0 fits to UQ0.HW without wrapping.
-        let x_uq0_hw: u16 = {
-            let mut x_uq0_hw: u16 = c_hw.wrapping_sub(b_uq1_hw /* exact b_hw/2 as UQ0.HW */);
-            // An e_0 error is comprised of errors due to
-            // * x0 being an inherently imprecise first approximation of 1/b_hw
-            // * C_hw being some (irrational) number **truncated** to W0 bits
-            // Please note that e_0 is calculated against the infinitely precise
-            // reciprocal of b_hw (that is, **truncated** version of b).
- // - // e_0 <= 3/4 - 1/sqrt(2) + 2^-W0 - - // By construction, 1 <= b < 2 - // f(x) = x * (2 - b*x) = 2*x - b*x^2 - // f'(x) = 2 * (1 - b*x) + let mut x_uq0_hw: HalfRep = + c_hw.wrapping_sub(b_uq1_hw /* exact b_hw/2 as UQ0.HW */); + + // An e_0 error is comprised of errors due to + // * x0 being an inherently imprecise first approximation of 1/b_hw + // * C_hw being some (irrational) number **truncated** to W0 bits + // Please note that e_0 is calculated against the infinitely precise + // reciprocal of b_hw (that is, **truncated** version of b). + // + // e_0 <= 3/4 - 1/sqrt(2) + 2^-W0 + // + // By construction, 1 <= b < 2 + // f(x) = x * (2 - b*x) = 2*x - b*x^2 + // f'(x) = 2 * (1 - b*x) + // + // On the [0, 1] interval, f(0) = 0, + // then it increses until f(1/b) = 1 / b, maximum on (0, 1), + // then it decreses to f(1) = 2 - b + // + // Let g(x) = x - f(x) = b*x^2 - x. + // On (0, 1/b), g(x) < 0 <=> f(x) > x + // On (1/b, 1], g(x) > 0 <=> f(x) < x + // + // For half-width iterations, b_hw is used instead of b. + for _ in 0..half_iterations { + // corr_UQ1_hw can be **larger** than 2 - b_hw*x by at most 1*Ulp + // of corr_UQ1_hw. + // "0.0 - (...)" is equivalent to "2.0 - (...)" in UQ1.(HW-1). + // On the other hand, corr_UQ1_hw should not overflow from 2.0 to 0.0 provided + // no overflow occurred earlier: ((rep_t)x_UQ0_hw * b_UQ1_hw >> HW) is + // expected to be strictly positive because b_UQ1_hw has its highest bit set + // and x_UQ0_hw should be rather large (it converges to 1/2 < 1/b_hw <= 1). // - // On the [0, 1] interval, f(0) = 0, - // then it increses until f(1/b) = 1 / b, maximum on (0, 1), - // then it decreses to f(1) = 2 - b + // Now, we should multiply UQ0.HW and UQ1.(HW-1) numbers, naturally + // obtaining an UQ1.(HW-1) number and proving its highest bit could be + // considered to be 0 to be able to represent it in UQ0.HW. + // From the above analysis of f(x), if corr_UQ1_hw would be represented + // without any intermediate loss of precision (that is, in twice_rep_t) + // x_UQ0_hw could be at most [1.]000... if b_hw is exactly 1.0 and strictly + // less otherwise. On the other hand, to obtain [1.]000..., one have to pass + // 1/b_hw == 1.0 to f(x), so this cannot occur at all without overflow (due + // to 1.0 being not representable as UQ0.HW). + // The fact corr_UQ1_hw was virtually round up (due to result of + // multiplication being **first** truncated, then negated - to improve + // error estimations) can increase x_UQ0_hw by up to 2*Ulp of x_UQ0_hw. // - // Let g(x) = x - f(x) = b*x^2 - x. - // On (0, 1/b), g(x) < 0 <=> f(x) > x - // On (1/b, 1], g(x) > 0 <=> f(x) < x + // Now, either no overflow occurred or x_UQ0_hw is 0 or 1 in its half_rep_t + // representation. In the latter case, x_UQ0_hw will be either 0 or 1 after + // any number of iterations, so just subtract 2 from the reciprocal + // approximation after last iteration. // - // For half-width iterations, b_hw is used instead of b. - #[allow(clippy::reversed_empty_ranges)] - for _ in 0..NUMBER_OF_HALF_ITERATIONS { - // corr_UQ1_hw can be **larger** than 2 - b_hw*x by at most 1*Ulp - // of corr_UQ1_hw. - // "0.0 - (...)" is equivalent to "2.0 - (...)" in UQ1.(HW-1). - // On the other hand, corr_UQ1_hw should not overflow from 2.0 to 0.0 provided - // no overflow occurred earlier: ((rep_t)x_UQ0_hw * b_UQ1_hw >> HW) is - // expected to be strictly positive because b_UQ1_hw has its highest bit set - // and x_UQ0_hw should be rather large (it converges to 1/2 < 1/b_hw <= 1). 
- let corr_uq1_hw: u16 = - 0.wrapping_sub((x_uq0_hw as u32).wrapping_mul(b_uq1_hw.cast()) >> hw) as u16; - - // Now, we should multiply UQ0.HW and UQ1.(HW-1) numbers, naturally - // obtaining an UQ1.(HW-1) number and proving its highest bit could be - // considered to be 0 to be able to represent it in UQ0.HW. - // From the above analysis of f(x), if corr_UQ1_hw would be represented - // without any intermediate loss of precision (that is, in twice_rep_t) - // x_UQ0_hw could be at most [1.]000... if b_hw is exactly 1.0 and strictly - // less otherwise. On the other hand, to obtain [1.]000..., one have to pass - // 1/b_hw == 1.0 to f(x), so this cannot occur at all without overflow (due - // to 1.0 being not representable as UQ0.HW). - // The fact corr_UQ1_hw was virtually round up (due to result of - // multiplication being **first** truncated, then negated - to improve - // error estimations) can increase x_UQ0_hw by up to 2*Ulp of x_UQ0_hw. - x_uq0_hw = ((x_uq0_hw as u32).wrapping_mul(corr_uq1_hw as u32) >> (hw - 1)) as u16; - // Now, either no overflow occurred or x_UQ0_hw is 0 or 1 in its half_rep_t - // representation. In the latter case, x_UQ0_hw will be either 0 or 1 after - // any number of iterations, so just subtract 2 from the reciprocal - // approximation after last iteration. - - // In infinite precision, with 0 <= eps1, eps2 <= U = 2^-HW: - // corr_UQ1_hw = 2 - (1/b_hw + e_n) * b_hw + 2*eps1 - // = 1 - e_n * b_hw + 2*eps1 - // x_UQ0_hw = (1/b_hw + e_n) * (1 - e_n*b_hw + 2*eps1) - eps2 - // = 1/b_hw - e_n + 2*eps1/b_hw + e_n - e_n^2*b_hw + 2*e_n*eps1 - eps2 - // = 1/b_hw + 2*eps1/b_hw - e_n^2*b_hw + 2*e_n*eps1 - eps2 - // e_{n+1} = -e_n^2*b_hw + 2*eps1/b_hw + 2*e_n*eps1 - eps2 - // = 2*e_n*eps1 - (e_n^2*b_hw + eps2) + 2*eps1/b_hw - // \------ >0 -------/ \-- >0 ---/ - // abs(e_{n+1}) <= 2*abs(e_n)*U + max(2*e_n^2 + U, 2 * U) - } - // For initial half-width iterations, U = 2^-HW - // Let abs(e_n) <= u_n * U, - // then abs(e_{n+1}) <= 2 * u_n * U^2 + max(2 * u_n^2 * U^2 + U, 2 * U) - // u_{n+1} <= 2 * u_n * U + max(2 * u_n^2 * U + 1, 2) - - // Account for possible overflow (see above). For an overflow to occur for the - // first time, for "ideal" corr_UQ1_hw (that is, without intermediate - // truncation), the result of x_UQ0_hw * corr_UQ1_hw should be either maximum - // value representable in UQ0.HW or less by 1. This means that 1/b_hw have to - // be not below that value (see g(x) above), so it is safe to decrement just - // once after the final iteration. On the other hand, an effective value of - // divisor changes after this point (from b_hw to b), so adjust here. - x_uq0_hw.wrapping_sub(1_u16) - }; + // In infinite precision, with 0 <= eps1, eps2 <= U = 2^-HW: + // corr_UQ1_hw = 2 - (1/b_hw + e_n) * b_hw + 2*eps1 + // = 1 - e_n * b_hw + 2*eps1 + // x_UQ0_hw = (1/b_hw + e_n) * (1 - e_n*b_hw + 2*eps1) - eps2 + // = 1/b_hw - e_n + 2*eps1/b_hw + e_n - e_n^2*b_hw + 2*e_n*eps1 - eps2 + // = 1/b_hw + 2*eps1/b_hw - e_n^2*b_hw + 2*e_n*eps1 - eps2 + // e_{n+1} = -e_n^2*b_hw + 2*eps1/b_hw + 2*e_n*eps1 - eps2 + // = 2*e_n*eps1 - (e_n^2*b_hw + eps2) + 2*eps1/b_hw + // \------ >0 -------/ \-- >0 ---/ + // abs(e_{n+1}) <= 2*abs(e_n)*U + max(2*e_n^2 + U, 2 * U) + x_uq0_hw = next_guess(x_uq0_hw, b_uq1_hw); + } + + // For initial half-width iterations, U = 2^-HW + // Let abs(e_n) <= u_n * U, + // then abs(e_{n+1}) <= 2 * u_n * U^2 + max(2 * u_n^2 * U^2 + U, 2 * U) + // u_{n+1} <= 2 * u_n * U + max(2 * u_n^2 * U + 1, 2) + // + // Account for possible overflow (see above). 
For an overflow to occur for the
+    // first time, for "ideal" corr_UQ1_hw (that is, without intermediate
+    // truncation), the result of x_UQ0_hw * corr_UQ1_hw should be either the maximum
+    // value representable in UQ0.HW or one less. This means that 1/b_hw has to
+    // be no less than that value (see g(x) above), so it is safe to decrement just
+    // once after the final iteration. On the other hand, the effective value of the
+    // divisor changes after this point (from b_hw to b), so adjust here.
+    x_uq0_hw = x_uq0_hw.wrapping_sub(one_hw);

 // Error estimations for full-precision iterations are calculated just
 // as above, but with U := 2^-W and taking extra decrementing into account.
 // We need at least one such iteration.
-
+    //
 // Simulating operations on a twice_rep_t to perform a single final full-width
 // iteration. Using ad-hoc multiplication implementations to take advantage
 // of particular structure of operands.
+    let blo: F::Int = b_uq1 & lo_mask;
-    let blo: u32 = (CastInto::<u32>::cast(b_uq1)) & lo_mask;
 // x_UQ0 = x_UQ0_hw * 2^HW - 1
 // x_UQ0 * b_UQ1 = (x_UQ0_hw * 2^HW) * (b_UQ1_hw * 2^HW + blo) - b_UQ1
 //
@@ -276,19 +360,23 @@ where
 // + [ x_UQ0_hw * blo ]
 // - [ b_UQ1 ]
 // = [ result ][.... discarded ...]
-    let corr_uq1 = negate_u32(
-        (x_uq0_hw as u32) * (b_uq1_hw as u32) + (((x_uq0_hw as u32) * (blo)) >> hw) - 1,
-    ); // account for *possible* carry
-    let lo_corr = corr_uq1 & lo_mask;
-    let hi_corr = corr_uq1 >> hw;
+    let corr_uq1: F::Int = (F::Int::from(x_uq0_hw) * F::Int::from(b_uq1_hw)
+        + ((F::Int::from(x_uq0_hw) * blo) >> hw))
+    .wrapping_sub(one)
+    .wrapping_neg(); // account for *possible* carry
+
+    let lo_corr: F::Int = corr_uq1 & lo_mask;
+    let hi_corr: F::Int = corr_uq1 >> hw;
+
 // x_UQ0 * corr_UQ1 = (x_UQ0_hw * 2^HW) * (hi_corr * 2^HW + lo_corr) - corr_UQ1
-    let mut x_uq0: <F as Float>::Int = ((((x_uq0_hw as u32) * hi_corr) << 1)
-        .wrapping_add(((x_uq0_hw as u32) * lo_corr) >> (hw - 1))
-        .wrapping_sub(2))
-    .cast(); // 1 to account for the highest bit of corr_UQ1 can be 1
-    // 1 to account for possible carry
-    // Just like the case of half-width iterations but with possibility
-    // of overflowing by one extra Ulp of x_UQ0.
+    let mut x_uq0: F::Int = ((F::Int::from(x_uq0_hw) * hi_corr) << 1)
+        .wrapping_add((F::Int::from(x_uq0_hw) * lo_corr) >> (hw - 1))
+        // 1 to account for the fact that the highest bit of corr_UQ1 can be 1
+        // 1 to account for a possible carry
+        // Just like the case of half-width iterations but with the possibility
+        // of overflowing by one extra Ulp of x_UQ0.
+        .wrapping_sub(F::Int::from(2u8));
+
 x_uq0 -= one;
 // ... and then traditional fixup by 2 should work
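The full-width step just simulated is easier to see without the double-width bookkeeping. Below is a minimal, self-contained sketch of the same Newton-Raphson refinement done at a single fixed width (the `recip_uq0_32` name and the UQ0.32/UQ1.31 layout are illustrative assumptions, not code from this patch; the real implementation above also tracks error bounds and the half-/full-width split):

```rust
/// Approximate 1/b for b in [1.0, 2.0): `b_uq1` is b as UQ1.31, result is UQ0.32.
fn recip_uq0_32(b_uq1: u32) -> u32 {
    // Seed x0 = 3/4 + 1/sqrt(2) - b/2, wrapped to [0, 1):
    // 0x7504F333 is (3/4 + 1/sqrt(2)) - 1 in UQ0.32.
    let mut x: u32 = 0x7504F333u32.wrapping_sub(b_uq1);
    for _ in 0..3 {
        // corr = 2 - b*x in UQ1.31; in wrapping arithmetic, `0 - t` acts as `2.0 - t`.
        let corr = 0u32.wrapping_sub(((x as u64 * b_uq1 as u64) >> 32) as u32);
        // x = x * corr, rescaled from UQ0.32 * UQ1.31 back to UQ0.32.
        x = ((x as u64 * corr as u64) >> 31) as u32;
    }
    // Final decrement, mirroring the overflow accounting discussed above.
    x.wrapping_sub(1)
}

fn main() {
    let b_uq1 = 0xC000_0000u32; // 1.5 as UQ1.31
    let x = recip_uq0_32(b_uq1);
    println!("~1/1.5 = {}", x as f64 / 2f64.powi(32)); // ~0.6666667
}
```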
@@ -296,7 +384,7 @@ where
 // abs(E_{N-1}) <= (u_{N-1} + 2 /* due to conversion e_n -> E_n */) * 2^-HW
 // + (2^-HW + 2^-W))
 // abs(E_{N-1}) <= (u_{N-1} + 3.01) * 2^-HW
-
+    //
 // Then like for the half-width iterations:
 // With 0 <= eps1, eps2 < 2^-W
 // E_N = 4 * E_{N-1} * eps1 - (E_{N-1}^2 * b + 4 * eps2) + 4 * eps1 / b
@@ -304,91 +392,55 @@ where
 // abs(E_N) <= 2^-W * [ 4 * (u_{N-1} + 3.01) * 2^-HW + max(4 + 2 * (u_{N-1} + 3.01)^2, 8) ]
 x_uq0
 } else {
-        // C is (3/4 + 1/sqrt(2)) - 1 truncated to 32 fractional bits as UQ0.n
-        let c: <F as Float>::Int = (0x7504F333 << (F::BITS - 32)).cast();
-        let x_uq0: <F as Float>::Int = c.wrapping_sub(b_uq1);
-        // E_0 <= 3/4 - 1/sqrt(2) + 2 * 2^-32
-        x_uq0
-    };
+        // C is (3/4 + 1/sqrt(2)) - 1 truncated to 64 fractional bits as UQ0.n
+        let c: F::Int = F::Int::from(0x7504F333u32) << (F::BITS - 32);
+        let mut x_uq0: F::Int = c.wrapping_sub(b_uq1);

-    let mut x_uq0 = if USE_NATIVE_FULL_ITERATIONS {
-        for _ in 0..NUMBER_OF_FULL_ITERATIONS {
-            let corr_uq1: u32 = 0.wrapping_sub(
-                ((CastInto::<u32>::cast(x_uq0) as u64) * (CastInto::<u32>::cast(b_uq1) as u64))
-                    >> F::BITS,
-            ) as u32;
-            x_uq0 = ((((CastInto::<u32>::cast(x_uq0) as u64) * (corr_uq1 as u64)) >> (F::BITS - 1))
-                as u32)
-                .cast();
+        // E_0 <= 3/4 - 1/sqrt(2) + 2 * 2^-64
+        // x_uq0
+        for _ in 0..full_iterations {
+            x_uq0 = next_guess(x_uq0, b_uq1);
 }
-        x_uq0
-    } else {
-        // not using native full iterations
+        x_uq0
 };

 // Finally, account for possible overflow, as explained above.
 x_uq0 = x_uq0.wrapping_sub(2.cast());
-    // u_n for different precisions (with N-1 half-width iterations):
-    // W0 is the precision of C
-    //   u_0 = (3/4 - 1/sqrt(2) + 2^-W0) * 2^HW
-
-    // Estimated with bc:
-    //   define half1(un) { return 2.0 * (un + un^2) / 2.0^hw + 1.0; }
-    //   define half2(un) { return 2.0 * un / 2.0^hw + 2.0; }
-    //   define full1(un) { return 4.0 * (un + 3.01) / 2.0^hw + 2.0 * (un + 3.01)^2 + 4.0; }
-    //   define full2(un) { return 4.0 * (un + 3.01) / 2.0^hw + 8.0; }
-
-    //             | f32 (0 + 3) | f32 (2 + 1) | f64 (3 + 1) | f128 (4 + 1)
-    // u_0         | < 184224974 | < 2812.1    | < 184224974 | < 791240234244348797
-    // u_1         | < 15804007  | < 242.7     | < 15804007  | < 67877681371350440
-    // u_2         | < 116308    | < 2.81      | < 116308    | < 499533100252317
-    // u_3         | < 7.31      |             | < 7.31      | < 27054456580
-    // u_4         |             |             |             | < 80.4
-    // Final (U_N) | same as u_3 | < 72        | < 218       | < 13920
-
-    // Add 2 to U_N due to final decrement.
-
-    let reciprocal_precision: <F as Float>::Int = 10.cast();
-
-    // Suppose 1/b - P * 2^-W < x < 1/b + P * 2^-W
-    let x_uq0 = x_uq0 - reciprocal_precision;
+    x_uq0 -= recip_precision.cast();
 // Now 1/b - (2*P) * 2^-W < x < 1/b
 // FIXME Is x_UQ0 still >= 0.5?
-    let mut quotient: <F as Float>::Int = x_uq0.widen_mul(a_significand << 1).hi();
+    let mut quotient_uq1: F::Int = x_uq0.widen_mul(a_significand << 1).hi();
 // Now, a/b - 4*P * 2^-W < q < a/b for q = quotient_UQ1 in UQ1.(SB+1+W).

 // quotient_UQ1 is in [0.5, 2.0) as UQ1.(SB+1),
 // adjust it to be in [1.0, 2.0) as UQ1.SB.
-    let (mut residual, written_exponent) = if quotient < (implicit_bit << 1) {
+    let mut residual_lo = if quotient_uq1 < (implicit_bit << 1) {
 // Highest bit is 0, so just reinterpret quotient_UQ1 as UQ1.SB,
 // effectively doubling its value as well as its error estimation.
-        let residual_lo = (a_significand << (significand_bits + 1)).wrapping_sub(
-            (CastInto::<u32>::cast(quotient).wrapping_mul(CastInto::<u32>::cast(b_significand)))
-                .cast(),
-        );
+        let residual_lo = (a_significand << (significand_bits + 1))
+            .wrapping_sub(quotient_uq1.wrapping_mul(b_significand));
+        res_exponent -= 1;
 a_significand <<= 1;
-        (residual_lo, written_exponent.wrapping_sub(1))
+        residual_lo
 } else {
 // Highest bit is 1 (the UQ1.(SB+1) value is in [1, 2)), convert it
 // to UQ1.SB by right shifting by 1. Least significant bit is omitted.
-        quotient >>= 1;
-        let residual_lo = (a_significand << significand_bits).wrapping_sub(
-            (CastInto::<u32>::cast(quotient).wrapping_mul(CastInto::<u32>::cast(b_significand)))
-                .cast(),
-        );
-        (residual_lo, written_exponent)
+        quotient_uq1 >>= 1;
+        (a_significand << significand_bits).wrapping_sub(quotient_uq1.wrapping_mul(b_significand))
 };

-    //drop mutability
-    let quotient = quotient;
+    // drop mutability
+    let quotient = quotient_uq1;
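A note on the shape of what follows: the fixup logic never re-derives a/b in floating point; rounding is driven entirely by the integer residual r = a - b*q. The same idea on plain integers, as a hedged illustration (not the patch's code; doubling the residual stands in for comparing against half the divisor):

```rust
fn main() {
    // Round-to-nearest division decided by the residual, as below.
    let (a, b) = (1_000_000u64, 3u64);
    let q = a / b; // truncated quotient
    let r = a - b * q; // residual r = a - b*q
    let rounded = q + u64::from(2 * r > b); // bump q when 2r exceeds b
    println!("q = {q}, r = {r}, rounded = {rounded}");
}
```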
 // NB: residualLo is calculated above for the normal result case.
 // It is re-computed on denormal path that is expected to be not so
 // performance-sensitive.
-
+    //
 // Now, q cannot be greater than a/b and can differ by at most 8*P * 2^-W + 2^-SB
 // Each NextAfter() increments the floating point value by at least 2^-SB
 // (more, if exponent was incremented).
 // Different cases (<---> is of 2^-SB length, * = a/b that is shown as a midpoint):
 // q
 // | | * | | | | |
 // <---> 2^t
 // | | | | | * | |
 // q
 // To require at most one NextAfter(), an error should be less than 1.5 * 2^-SB.
 // (8*P) * 2^-W + 2^-SB < 1.5 * 2^-SB
 // (8*P) * 2^-W < 0.5 * 2^-SB
 // P < 2^(W-4-SB)
 // Generally, for at most R NextAfter() to be enough,
 // P < (2*R - 1) * 2^(W-4-SB)
 // For f32 (0+3): 10 < 32 (OK)
 // For f32 (2+1): 32 < 74 < 32 * 3, so two NextAfter() are required
 // For f64: 220 < 256 (OK)
 // For f128: 4096 * 3 < 13922 < 4096 * 5 (three NextAfter() are required)
-
+    //
 // If we have overflowed the exponent, return infinity
-    if written_exponent >= max_exponent as i32 {
+    if res_exponent >= i32::cast_from(exponent_sat) {
 return F::from_repr(inf_rep | quotient_sign);
 }

 // Now, quotient <= the correctly-rounded result
 // and may need taking NextAfter() up to 3 times (see error estimates above)
 // r = a - b * q
-    let abs_result = if written_exponent > 0 {
+    let mut abs_result = if res_exponent > 0 {
 let mut ret = quotient & significand_mask;
-        ret |= ((written_exponent as u32) << significand_bits).cast();
-        residual <<= 1;
+        ret |= F::Int::from(res_exponent as u32) << significand_bits;
+        residual_lo <<= 1;
 ret
 } else {
-        if (significand_bits as i32 + written_exponent) < 0 {
+        if ((significand_bits as i32) + res_exponent) < 0 {
 return F::from_repr(quotient_sign);
 }
-        let ret = quotient.wrapping_shr(negate_u32(CastInto::<u32>::cast(written_exponent)) + 1);
-        residual = (CastInto::<u32>::cast(
-            a_significand.wrapping_shl(
-                significand_bits.wrapping_add(CastInto::<u32>::cast(written_exponent)),
-            ),
-        )
-        .wrapping_sub(
-            (CastInto::<u32>::cast(ret).wrapping_mul(CastInto::<u32>::cast(b_significand))) << 1,
-        ))
-        .cast();
+
+        let ret = quotient.wrapping_shr(u32::cast_from(res_exponent.wrapping_neg()) + 1);
+        residual_lo = a_significand
+            .wrapping_shl(significand_bits.wrapping_add(CastInto::<u32>::cast(res_exponent)))
+            .wrapping_sub(ret.wrapping_mul(b_significand) << 1);
 ret
 };
-    // Round
-    let abs_result = {
-        residual += abs_result & one; // tie to even
-        // The above line conditionally turns the below LT comparison into LTE
-
-        if residual > b_significand {
-            abs_result + one
-        } else {
-            abs_result
-        }
-    };
-    F::from_repr(abs_result | quotient_sign)
-}

-fn div64<F: Float>(a: F, b: F) -> F
-where
-    u32: CastInto<F::Int>,
-    F::Int: CastInto<u32>,
-    i32: CastInto<F::Int>,
-    F::Int: CastInto<i32>,
-    u64: CastInto<F::Int>,
-    F::Int: CastInto<u64>,
-    i64: CastInto<F::Int>,
-    F::Int: CastInto<i64>,
-    F::Int: HInt,
-{
-    const NUMBER_OF_HALF_ITERATIONS: usize = 3;
-    const NUMBER_OF_FULL_ITERATIONS: usize = 1;
-    const 
USE_NATIVE_FULL_ITERATIONS: bool = false; - - let one = F::Int::ONE; - let zero = F::Int::ZERO; - let hw = F::BITS / 2; - let lo_mask = u64::MAX >> hw; - - let significand_bits = F::SIGNIFICAND_BITS; - let max_exponent = F::EXPONENT_MAX; - - let exponent_bias = F::EXPONENT_BIAS; + residual_lo += abs_result & one; // tie to even + // conditionally turns the below LT comparison into LTE + abs_result += u8::from(residual_lo > b_significand).into(); - let implicit_bit = F::IMPLICIT_BIT; - let significand_mask = F::SIGNIFICAND_MASK; - let sign_bit = F::SIGN_MASK as F::Int; - let abs_mask = sign_bit - one; - let exponent_mask = F::EXPONENT_MASK; - let inf_rep = exponent_mask; - let quiet_bit = implicit_bit >> 1; - let qnan_rep = exponent_mask | quiet_bit; - - #[inline(always)] - fn negate_u64(a: u64) -> u64 { - (::wrapping_neg(a as i64)) as u64 + if F::BITS == 128 || (F::BITS == 32 && half_iterations > 0) { + // Do not round Infinity to NaN + abs_result += + u8::from(abs_result < inf_rep && residual_lo > (2 + 1).cast() * b_significand).into(); } - let a_rep = a.repr(); - let b_rep = b.repr(); - - let a_exponent = (a_rep >> significand_bits) & max_exponent.cast(); - let b_exponent = (b_rep >> significand_bits) & max_exponent.cast(); - let quotient_sign = (a_rep ^ b_rep) & sign_bit; - - let mut a_significand = a_rep & significand_mask; - let mut b_significand = b_rep & significand_mask; - let mut scale = 0; - - // Detect if a or b is zero, denormal, infinity, or NaN. - if a_exponent.wrapping_sub(one) >= (max_exponent - 1).cast() - || b_exponent.wrapping_sub(one) >= (max_exponent - 1).cast() - { - let a_abs = a_rep & abs_mask; - let b_abs = b_rep & abs_mask; - - // NaN / anything = qNaN - if a_abs > inf_rep { - return F::from_repr(a_rep | quiet_bit); - } - // anything / NaN = qNaN - if b_abs > inf_rep { - return F::from_repr(b_rep | quiet_bit); - } - - if a_abs == inf_rep { - if b_abs == inf_rep { - // infinity / infinity = NaN - return F::from_repr(qnan_rep); - } else { - // infinity / anything else = +/- infinity - return F::from_repr(a_abs | quotient_sign); - } - } - - // anything else / infinity = +/- 0 - if b_abs == inf_rep { - return F::from_repr(quotient_sign); - } - - if a_abs == zero { - if b_abs == zero { - // zero / zero = NaN - return F::from_repr(qnan_rep); - } else { - // zero / anything else = +/- zero - return F::from_repr(quotient_sign); - } - } - - // anything else / zero = +/- infinity - if b_abs == zero { - return F::from_repr(inf_rep | quotient_sign); - } - - // one or both of a or b is denormal, the other (if applicable) is a - // normal number. Renormalize one or both of a and b, and set scale to - // include the necessary exponent adjustment. - if a_abs < implicit_bit { - let (exponent, significand) = F::normalize(a_significand); - scale += exponent; - a_significand = significand; - } - - if b_abs < implicit_bit { - let (exponent, significand) = F::normalize(b_significand); - scale -= exponent; - b_significand = significand; - } + if F::BITS == 128 { + abs_result += + u8::from(abs_result < inf_rep && residual_lo > (4 + 1).cast() * b_significand).into(); } - // Set the implicit significand bit. If we fell through from the - // denormal path it was already set by normalize( ), but setting it twice - // won't hurt anything. 
- a_significand |= implicit_bit; - b_significand |= implicit_bit; - - let written_exponent: i64 = CastInto::::cast( - a_exponent - .wrapping_sub(b_exponent) - .wrapping_add(scale.cast()), - ) - .wrapping_add(exponent_bias as u64) as i64; - let b_uq1 = b_significand << (F::BITS - significand_bits - 1); - - // Align the significand of b as a UQ1.(n-1) fixed-point number in the range - // [1.0, 2.0) and get a UQ0.n approximate reciprocal using a small minimax - // polynomial approximation: x0 = 3/4 + 1/sqrt(2) - b/2. - // The max error for this approximation is achieved at endpoints, so - // abs(x0(b) - 1/b) <= abs(x0(1) - 1/1) = 3/4 - 1/sqrt(2) = 0.04289..., - // which is about 4.5 bits. - // The initial approximation is between x0(1.0) = 0.9571... and x0(2.0) = 0.4571... - - // Then, refine the reciprocal estimate using a quadratically converging - // Newton-Raphson iteration: - // x_{n+1} = x_n * (2 - x_n * b) - // - // Let b be the original divisor considered "in infinite precision" and - // obtained from IEEE754 representation of function argument (with the - // implicit bit set). Corresponds to rep_t-sized b_UQ1 represented in - // UQ1.(W-1). - // - // Let b_hw be an infinitely precise number obtained from the highest (HW-1) - // bits of divisor significand (with the implicit bit set). Corresponds to - // half_rep_t-sized b_UQ1_hw represented in UQ1.(HW-1) that is a **truncated** - // version of b_UQ1. - // - // Let e_n := x_n - 1/b_hw - // E_n := x_n - 1/b - // abs(E_n) <= abs(e_n) + (1/b_hw - 1/b) - // = abs(e_n) + (b - b_hw) / (b*b_hw) - // <= abs(e_n) + 2 * 2^-HW - - // rep_t-sized iterations may be slower than the corresponding half-width - // variant depending on the handware and whether single/double/quad precision - // is selected. - // NB: Using half-width iterations increases computation errors due to - // rounding, so error estimations have to be computed taking the selected - // mode into account! - - let mut x_uq0 = if NUMBER_OF_HALF_ITERATIONS > 0 { - // Starting with (n-1) half-width iterations - let b_uq1_hw: u32 = - (CastInto::::cast(b_significand) >> (significand_bits + 1 - hw)) as u32; - - // C is (3/4 + 1/sqrt(2)) - 1 truncated to W0 fractional bits as UQ0.HW - // with W0 being either 16 or 32 and W0 <= HW. - // That is, C is the aforementioned 3/4 + 1/sqrt(2) constant (from which - // b/2 is subtracted to obtain x0) wrapped to [0, 1) range. - - // HW is at least 32. Shifting into the highest bits if needed. - let c_hw = (0x7504F333_u64 as u32).wrapping_shl(hw.wrapping_sub(32)); - - // b >= 1, thus an upper bound for 3/4 + 1/sqrt(2) - b/2 is about 0.9572, - // so x0 fits to UQ0.HW without wrapping. - let x_uq0_hw: u32 = { - let mut x_uq0_hw: u32 = c_hw.wrapping_sub(b_uq1_hw /* exact b_hw/2 as UQ0.HW */); - // dbg!(x_uq0_hw); - // An e_0 error is comprised of errors due to - // * x0 being an inherently imprecise first approximation of 1/b_hw - // * C_hw being some (irrational) number **truncated** to W0 bits - // Please note that e_0 is calculated against the infinitely precise - // reciprocal of b_hw (that is, **truncated** version of b). - // - // e_0 <= 3/4 - 1/sqrt(2) + 2^-W0 - - // By construction, 1 <= b < 2 - // f(x) = x * (2 - b*x) = 2*x - b*x^2 - // f'(x) = 2 * (1 - b*x) - // - // On the [0, 1] interval, f(0) = 0, - // then it increses until f(1/b) = 1 / b, maximum on (0, 1), - // then it decreses to f(1) = 2 - b - // - // Let g(x) = x - f(x) = b*x^2 - x. 
- // On (0, 1/b), g(x) < 0 <=> f(x) > x - // On (1/b, 1], g(x) > 0 <=> f(x) < x - // - // For half-width iterations, b_hw is used instead of b. - for _ in 0..NUMBER_OF_HALF_ITERATIONS { - // corr_UQ1_hw can be **larger** than 2 - b_hw*x by at most 1*Ulp - // of corr_UQ1_hw. - // "0.0 - (...)" is equivalent to "2.0 - (...)" in UQ1.(HW-1). - // On the other hand, corr_UQ1_hw should not overflow from 2.0 to 0.0 provided - // no overflow occurred earlier: ((rep_t)x_UQ0_hw * b_UQ1_hw >> HW) is - // expected to be strictly positive because b_UQ1_hw has its highest bit set - // and x_UQ0_hw should be rather large (it converges to 1/2 < 1/b_hw <= 1). - let corr_uq1_hw: u32 = - 0.wrapping_sub(((x_uq0_hw as u64).wrapping_mul(b_uq1_hw as u64)) >> hw) as u32; - // dbg!(corr_uq1_hw); - - // Now, we should multiply UQ0.HW and UQ1.(HW-1) numbers, naturally - // obtaining an UQ1.(HW-1) number and proving its highest bit could be - // considered to be 0 to be able to represent it in UQ0.HW. - // From the above analysis of f(x), if corr_UQ1_hw would be represented - // without any intermediate loss of precision (that is, in twice_rep_t) - // x_UQ0_hw could be at most [1.]000... if b_hw is exactly 1.0 and strictly - // less otherwise. On the other hand, to obtain [1.]000..., one have to pass - // 1/b_hw == 1.0 to f(x), so this cannot occur at all without overflow (due - // to 1.0 being not representable as UQ0.HW). - // The fact corr_UQ1_hw was virtually round up (due to result of - // multiplication being **first** truncated, then negated - to improve - // error estimations) can increase x_UQ0_hw by up to 2*Ulp of x_UQ0_hw. - x_uq0_hw = ((x_uq0_hw as u64).wrapping_mul(corr_uq1_hw as u64) >> (hw - 1)) as u32; - // dbg!(x_uq0_hw); - // Now, either no overflow occurred or x_UQ0_hw is 0 or 1 in its half_rep_t - // representation. In the latter case, x_UQ0_hw will be either 0 or 1 after - // any number of iterations, so just subtract 2 from the reciprocal - // approximation after last iteration. - - // In infinite precision, with 0 <= eps1, eps2 <= U = 2^-HW: - // corr_UQ1_hw = 2 - (1/b_hw + e_n) * b_hw + 2*eps1 - // = 1 - e_n * b_hw + 2*eps1 - // x_UQ0_hw = (1/b_hw + e_n) * (1 - e_n*b_hw + 2*eps1) - eps2 - // = 1/b_hw - e_n + 2*eps1/b_hw + e_n - e_n^2*b_hw + 2*e_n*eps1 - eps2 - // = 1/b_hw + 2*eps1/b_hw - e_n^2*b_hw + 2*e_n*eps1 - eps2 - // e_{n+1} = -e_n^2*b_hw + 2*eps1/b_hw + 2*e_n*eps1 - eps2 - // = 2*e_n*eps1 - (e_n^2*b_hw + eps2) + 2*eps1/b_hw - // \------ >0 -------/ \-- >0 ---/ - // abs(e_{n+1}) <= 2*abs(e_n)*U + max(2*e_n^2 + U, 2 * U) - } - // For initial half-width iterations, U = 2^-HW - // Let abs(e_n) <= u_n * U, - // then abs(e_{n+1}) <= 2 * u_n * U^2 + max(2 * u_n^2 * U^2 + U, 2 * U) - // u_{n+1} <= 2 * u_n * U + max(2 * u_n^2 * U + 1, 2) - - // Account for possible overflow (see above). For an overflow to occur for the - // first time, for "ideal" corr_UQ1_hw (that is, without intermediate - // truncation), the result of x_UQ0_hw * corr_UQ1_hw should be either maximum - // value representable in UQ0.HW or less by 1. This means that 1/b_hw have to - // be not below that value (see g(x) above), so it is safe to decrement just - // once after the final iteration. On the other hand, an effective value of - // divisor changes after this point (from b_hw to b), so adjust here. - x_uq0_hw.wrapping_sub(1_u32) - }; - - // Error estimations for full-precision iterations are calculated just - // as above, but with U := 2^-W and taking extra decrementing into account. 
-    // We need at least one such iteration.
-
-    // Simulating operations on a twice_rep_t to perform a single final full-width
-    // iteration. Using ad-hoc multiplication implementations to take advantage
-    // of particular structure of operands.
-    let blo: u64 = (CastInto::<u64>::cast(b_uq1)) & lo_mask;
-    // x_UQ0 = x_UQ0_hw * 2^HW - 1
-    // x_UQ0 * b_UQ1 = (x_UQ0_hw * 2^HW) * (b_UQ1_hw * 2^HW + blo) - b_UQ1
-    //
-    // <--- higher half ---><--- lower half --->
-    // [x_UQ0_hw * b_UQ1_hw]
-    // + [ x_UQ0_hw * blo ]
-    // - [ b_UQ1 ]
-    // = [ result ][.... discarded ...]
-    let corr_uq1 = negate_u64(
-        (x_uq0_hw as u64) * (b_uq1_hw as u64) + (((x_uq0_hw as u64) * (blo)) >> hw) - 1,
-    ); // account for *possible* carry
-    let lo_corr = corr_uq1 & lo_mask;
-    let hi_corr = corr_uq1 >> hw;
-    // x_UQ0 * corr_UQ1 = (x_UQ0_hw * 2^HW) * (hi_corr * 2^HW + lo_corr) - corr_UQ1
-    let mut x_uq0: <F as Float>::Int = ((((x_uq0_hw as u64) * hi_corr) << 1)
-        .wrapping_add(((x_uq0_hw as u64) * lo_corr) >> (hw - 1))
-        .wrapping_sub(2))
-    .cast(); // 1 to account for the highest bit of corr_UQ1 can be 1
-    // 1 to account for possible carry
-    // Just like the case of half-width iterations but with possibility
-    // of overflowing by one extra Ulp of x_UQ0.
-    x_uq0 -= one;
-    // ... and then traditional fixup by 2 should work
-
-    // On error estimation:
-    // abs(E_{N-1}) <= (u_{N-1} + 2 /* due to conversion e_n -> E_n */) * 2^-HW
-    // + (2^-HW + 2^-W))
-    // abs(E_{N-1}) <= (u_{N-1} + 3.01) * 2^-HW
-
-    // Then like for the half-width iterations:
-    // With 0 <= eps1, eps2 < 2^-W
-    // E_N = 4 * E_{N-1} * eps1 - (E_{N-1}^2 * b + 4 * eps2) + 4 * eps1 / b
-    // abs(E_N) <= 2^-W * [ 4 * abs(E_{N-1}) + max(2 * abs(E_{N-1})^2 * 2^W + 4, 8)) ]
-    // abs(E_N) <= 2^-W * [ 4 * (u_{N-1} + 3.01) * 2^-HW + max(4 + 2 * (u_{N-1} + 3.01)^2, 8) ]
-    x_uq0
-    } else {
-    // C is (3/4 + 1/sqrt(2)) - 1 truncated to 64 fractional bits as UQ0.n
-    let c: <F as Float>::Int = (0x7504F333 << (F::BITS - 32)).cast();
-    let x_uq0: <F as Float>::Int = c.wrapping_sub(b_uq1);
-    // E_0 <= 3/4 - 1/sqrt(2) + 2 * 2^-64
-    x_uq0
-    };
+    F::from_repr(abs_result | quotient_sign)
+}

-    let mut x_uq0 = if USE_NATIVE_FULL_ITERATIONS {
-        for _ in 0..NUMBER_OF_FULL_ITERATIONS {
-            let corr_uq1: u64 = 0.wrapping_sub(
-                (CastInto::<u64>::cast(x_uq0) * (CastInto::<u64>::cast(b_uq1))) >> F::BITS,
-            );
-            x_uq0 = ((((CastInto::<u64>::cast(x_uq0) as u128) * (corr_uq1 as u128))
-                >> (F::BITS - 1)) as u64)
-                .cast();
-        }
-        x_uq0
+/// Calculate the number of iterations required for a float type's precision.
+///
+/// This returns `(h, f)` where `h` is the number of iterations to be done using integers at half
+/// the float's bit width, and `f` is the number of iterations done using integers of the float's
+/// full width. This is further explained in the module documentation.
+///
+/// # Requirements
+///
+/// The initial estimate should have at least 8 bits of precision. If this is not true, results
+/// will be inaccurate.
+const fn get_iterations<F: Float>() -> (usize, usize) {
+    // Precision doubles with each iteration. Assume we start with 8 bits of precision.
+    let total_iterations = F::BITS.ilog2() as usize - 2;
+
+    if 2 * size_of::<F::Int>() <= size_of::<*const ()>() {
+        // If widening multiplication will be efficient (uses word-sized integers), there is no
+        // reason to use half-sized iterations.
+        (0, total_iterations)
 } else {
-        // not using native full iterations
-        x_uq0
-    };
-
-    // Finally, account for possible overflow, as explained above.
-    x_uq0 = x_uq0.wrapping_sub(2.cast());
-
-    // u_n for different precisions (with N-1 half-width iterations):
-    // W0 is the precision of C
-    //   u_0 = (3/4 - 1/sqrt(2) + 2^-W0) * 2^HW
-
-    // Estimated with bc:
-    //   define half1(un) { return 2.0 * (un + un^2) / 2.0^hw + 1.0; }
-    //   define half2(un) { return 2.0 * un / 2.0^hw + 2.0; }
-    //   define full1(un) { return 4.0 * (un + 3.01) / 2.0^hw + 2.0 * (un + 3.01)^2 + 4.0; }
-    //   define full2(un) { return 4.0 * (un + 3.01) / 2.0^hw + 8.0; }
-
-    //             | f32 (0 + 3) | f32 (2 + 1) | f64 (3 + 1) | f128 (4 + 1)
-    // u_0         | < 184224974 | < 2812.1    | < 184224974 | < 791240234244348797
-    // u_1         | < 15804007  | < 242.7     | < 15804007  | < 67877681371350440
-    // u_2         | < 116308    | < 2.81      | < 116308    | < 499533100252317
-    // u_3         | < 7.31      |             | < 7.31      | < 27054456580
-    // u_4         |             |             |             | < 80.4
-    // Final (U_N) | same as u_3 | < 72        | < 218       | < 13920
-
-    // Add 2 to U_N due to final decrement.
-
-    let reciprocal_precision: <F as Float>::Int = 220.cast();
-
-    // Suppose 1/b - P * 2^-W < x < 1/b + P * 2^-W
-    let x_uq0 = x_uq0 - reciprocal_precision;
-    // Now 1/b - (2*P) * 2^-W < x < 1/b
-    // FIXME Is x_UQ0 still >= 0.5?
+        // Otherwise, do as many iterations as possible at half width.
+        (total_iterations - 1, 1)
+    }
+}
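As a quick cross-check of the doubling argument in the comment, the counts this yields line up with precision doubling from the 8-bit seed (a standalone sketch, not invoked by the patch):

```rust
fn main() {
    // Starting from ~8 bits, precision doubles per iteration, so
    // log2(BITS) - 2 iterations reach the full significand width.
    for bits in [32u32, 64, 128] {
        let total = bits.ilog2() - 2;
        println!("f{bits}: {total} iterations -> 8 * 2^{total} = {} bits", 8u32 << total);
    }
}
```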
-    let mut quotient: <F as Float>::Int = x_uq0.widen_mul(a_significand << 1).hi();
-    // Now, a/b - 4*P * 2^-W < q < a/b for q = quotient_UQ1 in UQ1.(SB+1+W).
+/// `u_n` for different precisions (with N-1 half-width iterations).
+///
+/// W0 is the precision of C
+///   u_0 = (3/4 - 1/sqrt(2) + 2^-W0) * 2^HW
+///
+/// Estimated with bc:
+///
+/// ```text
+/// define half1(un) { return 2.0 * (un + un^2) / 2.0^hw + 1.0; }
+/// define half2(un) { return 2.0 * un / 2.0^hw + 2.0; }
+/// define full1(un) { return 4.0 * (un + 3.01) / 2.0^hw + 2.0 * (un + 3.01)^2 + 4.0; }
+/// define full2(un) { return 4.0 * (un + 3.01) / 2.0^hw + 8.0; }
+///
+///             | f32 (0 + 3) | f32 (2 + 1) | f64 (3 + 1) | f128 (4 + 1)
+/// u_0         | < 184224974 | < 2812.1    | < 184224974 | < 791240234244348797
+/// u_1         | < 15804007  | < 242.7     | < 15804007  | < 67877681371350440
+/// u_2         | < 116308    | < 2.81      | < 116308    | < 499533100252317
+/// u_3         | < 7.31      |             | < 7.31      | < 27054456580
+/// u_4         |             |             |             | < 80.4
+/// Final (U_N) | same as u_3 | < 72        | < 218       | < 13920
+/// ```
+///
+/// Add 2 to `U_N` due to final decrement.
+const fn reciprocal_precision<F: Float>() -> u16 {
+    let (half_iterations, full_iterations) = get_iterations::<F>();
+
+    if full_iterations < 1 {
+        panic!("Must have at least one full iteration");
+    }
-    // quotient_UQ1 is in [0.5, 2.0) as UQ1.(SB+1),
-    // adjust it to be in [1.0, 2.0) as UQ1.SB.
-    let (mut residual, written_exponent) = if quotient < (implicit_bit << 1) {
-        // Highest bit is 0, so just reinterpret quotient_UQ1 as UQ1.SB,
-        // effectively doubling its value as well as its error estimation.
-        let residual_lo = (a_significand << (significand_bits + 1)).wrapping_sub(
-            (CastInto::<u64>::cast(quotient).wrapping_mul(CastInto::<u64>::cast(b_significand)))
-                .cast(),
-        );
-        a_significand <<= 1;
-        (residual_lo, written_exponent.wrapping_sub(1))
+    // FIXME(tgross35): calculate this programmatically
+    if F::BITS == 32 && half_iterations == 2 && full_iterations == 1 {
+        74u16
+    } else if F::BITS == 32 && half_iterations == 0 && full_iterations == 3 {
+        10
+    } else if F::BITS == 64 && half_iterations == 3 && full_iterations == 1 {
+        220
+    } else if F::BITS == 128 && half_iterations == 4 && full_iterations == 1 {
+        13922
 } else {
-        // Highest bit is 1 (the UQ1.(SB+1) value is in [1, 2)), convert it
-        // to UQ1.SB by right shifting by 1. Least significant bit is omitted.
-        quotient >>= 1;
-        let residual_lo = (a_significand << significand_bits).wrapping_sub(
-            (CastInto::<u64>::cast(quotient).wrapping_mul(CastInto::<u64>::cast(b_significand)))
-                .cast(),
-        );
-        (residual_lo, written_exponent)
-    };
-
-    //drop mutability
-    let quotient = quotient;
-
-    // NB: residualLo is calculated above for the normal result case.
-    // It is re-computed on denormal path that is expected to be not so
-    // performance-sensitive.
+        panic!("Invalid number of iterations")
+    }
+}
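These hard-coded precisions can be checked against the `P < (2*R - 1) * 2^(W-4-SB)` bound from the NextAfter analysis in the error-estimation comments; a small sketch with the values transcribed from those comments (illustrative only):

```rust
fn main() {
    // (name, W, SB, P): total bits, significand bits, reciprocal precision.
    let cases = [("f32", 32u32, 23u32, 74u64), ("f64", 64, 52, 220), ("f128", 128, 112, 13922)];
    for (name, w, sb, p) in cases {
        let step = 1u64 << (w - 4 - sb); // 2^(W-4-SB)
        // Smallest R with P < (2R - 1) * 2^(W-4-SB), i.e. NextAfter() calls needed.
        let r = (1u64..=3).find(|r| p < (2 * r - 1) * step);
        println!("{name}: P = {p}, 2^(W-4-SB) = {step}, NextAfter bound R = {r:?}");
    }
}
```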
-    // Now, q cannot be greater than a/b and can differ by at most 8*P * 2^-W + 2^-SB
-    // Each NextAfter() increments the floating point value by at least 2^-SB
-    // (more, if exponent was incremented).
-    // Different cases (<---> is of 2^-SB length, * = a/b that is shown as a midpoint):
-    // q
-    // | | * | | | | |
-    // <---> 2^t
-    // | | | | | * | |
-    // q
-    // To require at most one NextAfter(), an error should be less than 1.5 * 2^-SB.
-    // (8*P) * 2^-W + 2^-SB < 1.5 * 2^-SB
-    // (8*P) * 2^-W < 0.5 * 2^-SB
-    // P < 2^(W-4-SB)
-    // Generally, for at most R NextAfter() to be enough,
-    // P < (2*R - 1) * 2^(W-4-SB)
-    // For f32 (0+3): 10 < 32 (OK)
-    // For f32 (2+1): 32 < 74 < 32 * 3, so two NextAfter() are required
-    // For f64: 220 < 256 (OK)
-    // For f128: 4096 * 3 < 13922 < 4096 * 5 (three NextAfter() are required)
+/// The value of `C` adjusted to half width.
+///
+/// C is (3/4 + 1/sqrt(2)) - 1 truncated to W0 fractional bits as UQ0.HW with W0 being either
+/// 16 or 32 and W0 <= HW. That is, C is the aforementioned 3/4 + 1/sqrt(2) constant (from
+/// which b/2 is subtracted to obtain x0) wrapped to [0, 1) range.
+fn c_hw<F: Float>() -> HalfRep<F>
+where
+    F::Int: DInt,
+    u128: CastInto<HalfRep<F>>,
+{
+    const C_U128: u128 = 0x7504f333f9de6108b2fb1366eaa6a542;
+    const { C_U128 >> (u128::BITS - <HalfRep<F> as MinInt>::BITS) }.cast()
+}

-    // If we have overflowed the exponent, return infinity
-    if written_exponent >= max_exponent as i64 {
-        return F::from_repr(inf_rep | quotient_sign);
-    }
+/// Perform one iteration at any width to approach `1/b`, given previous guess `x`. Returns
+/// the next `x` as a UQ0 number.
+///
+/// This is the `x_{n+1} = 2*x_n - b*x_n^2` algorithm, implemented as `x_n * (2 - b*x_n)`. It
+/// uses widening multiplication to calculate the result with necessary precision.
+fn next_guess<I>(x_uq0: I, b_uq1: I) -> I
+where
+    I: Int + HInt,
+    <I as HInt>::D: ops::Shr<u32, Output = <I as HInt>::D>,
+{
+    // `corr = 2 - b*x_n`
+    //
+    // This looks like `0 - b*x_n`. However, this works - in `UQ1`, `0.0 - x = 2.0 - x`.
+    let corr_uq1: I = I::ZERO.wrapping_sub(x_uq0.widen_mul(b_uq1).hi());

-    // Now, quotient <= the correctly-rounded result
-    // and may need taking NextAfter() up to 3 times (see error estimates above)
-    // r = a - b * q
-    let abs_result = if written_exponent > 0 {
-        let mut ret = quotient & significand_mask;
-        ret |= ((written_exponent as u64) << significand_bits).cast();
-        residual <<= 1;
-        ret
-    } else {
-        if (significand_bits as i64 + written_exponent) < 0 {
-            return F::from_repr(quotient_sign);
-        }
-        let ret =
-            quotient.wrapping_shr((negate_u64(CastInto::<u64>::cast(written_exponent)) + 1) as u32);
-        residual = (CastInto::<u64>::cast(
-            a_significand.wrapping_shl(
-                significand_bits.wrapping_add(CastInto::<u64>::cast(written_exponent)),
-            ),
-        )
-        .wrapping_sub(
-            (CastInto::<u64>::cast(ret).wrapping_mul(CastInto::<u64>::cast(b_significand))) << 1,
-        ))
-        .cast();
-        ret
-    };
-    // Round
-    let abs_result = {
-        residual += abs_result & one; // tie to even
-        // conditionally turns the below LT comparison into LTE
-        if residual > b_significand {
-            abs_result + one
-        } else {
-            abs_result
-        }
-    };
-    F::from_repr(abs_result | quotient_sign)
+    // `x_n * corr = x_n * (2 - b*x_n)`
+    (x_uq0.widen_mul(corr_uq1) >> (I::BITS - 1)).lo()
 }
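`widen_mul`/`hi`/`lo` above abstract the double-width product; the same convention written out for one concrete width, as an assumed-shape sketch (the helper mirrors, but is not, the crate's `HInt` API):

```rust
/// High half of a u64 x u64 product, as `x_uq0.widen_mul(b_uq1).hi()` does generically.
fn mul_hi_u64(x_uq0: u64, b_uq1: u64) -> u64 {
    ((x_uq0 as u128 * b_uq1 as u128) >> 64) as u64
}

fn main() {
    let x = (2.0 / 3.0 * 2f64.powi(64)) as u64; // ~2/3 as UQ0.64
    let b = (1.5 * 2f64.powi(63)) as u64; // 1.5 as UQ1.63
    // UQ0.64 * UQ1.63 keeps the integer part in the high word: ~1.0 as UQ1.63.
    println!("x*b ~= {}", mul_hi_u64(x, b) as f64 / 2f64.powi(63));
}
```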
 intrinsics! {
+    #[avr_skip]
     #[arm_aeabi_alias = __aeabi_fdiv]
     pub extern "C" fn __divsf3(a: f32, b: f32) -> f32 {
-        div32(a, b)
+        div(a, b)
     }

+    #[avr_skip]
     #[arm_aeabi_alias = __aeabi_ddiv]
     pub extern "C" fn __divdf3(a: f64, b: f64) -> f64 {
-        div64(a, b)
+        div(a, b)
+    }
+
+    #[avr_skip]
+    #[ppc_alias = __divkf3]
+    #[cfg(f128_enabled)]
+    pub extern "C" fn __divtf3(a: f128, b: f128) -> f128 {
+        div(a, b)
     }

     #[cfg(target_arch = "arm")]
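A behavioral spot-check of these exported symbols is possible from an ordinary host binary, since the toolchain's compiler-rt provides the same interface this file implements (a test sketch; exact equality is expected because both sides produce the correctly rounded quotient, but treat the linkage and equality as assumptions):

```rust
extern "C" {
    fn __divsf3(a: f32, b: f32) -> f32;
    fn __divdf3(a: f64, b: f64) -> f64;
}

fn main() {
    // Compare the soft-float quotient against the host's hardware division.
    let (q32, q64) = unsafe { (__divsf3(1.0, 3.0), __divdf3(1.0, 3.0)) };
    assert_eq!(q32, 1.0f32 / 3.0);
    assert_eq!(q64, 1.0f64 / 3.0);
    println!("ok: {q32} {q64}");
}
```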
diff --git a/src/float/extend.rs b/src/float/extend.rs
index 39633773..997475c8 100644
--- a/src/float/extend.rs
+++ b/src/float/extend.rs
@@ -1,5 +1,5 @@
-use float::Float;
-use int::{CastInto, Int};
+use crate::float::Float;
+use crate::int::{CastInto, Int, MinInt};

 /// Generic conversion from a narrower to a wider IEEE-754 floating-point type
 fn extend<F: Float, R: Float>(a: F) -> R
 where
@@ -70,14 +70,53 @@ where
 }

 intrinsics! {
+    #[avr_skip]
     #[aapcs_on_arm]
     #[arm_aeabi_alias = __aeabi_f2d]
     pub extern "C" fn __extendsfdf2(a: f32) -> f64 {
         extend(a)
     }
+}
+
+intrinsics! {
+    #[avr_skip]
+    #[aapcs_on_arm]
+    #[apple_f16_arg_abi]
+    #[arm_aeabi_alias = __aeabi_h2f]
+    #[cfg(f16_enabled)]
+    pub extern "C" fn __extendhfsf2(a: f16) -> f32 {
+        extend(a)
+    }
+
+    #[avr_skip]
+    #[aapcs_on_arm]
+    #[apple_f16_arg_abi]
+    #[cfg(f16_enabled)]
+    pub extern "C" fn __gnu_h2f_ieee(a: f16) -> f32 {
+        extend(a)
+    }

-    #[cfg(target_arch = "arm")]
-    pub extern "C" fn __extendsfdf2vfp(a: f32) -> f64 {
-        a as f64 // LLVM generate 'fcvtds'
+    #[avr_skip]
+    #[aapcs_on_arm]
+    #[ppc_alias = __extendhfkf2]
+    #[cfg(all(f16_enabled, f128_enabled))]
+    pub extern "C" fn __extendhftf2(a: f16) -> f128 {
+        extend(a)
+    }
+
+    #[avr_skip]
+    #[aapcs_on_arm]
+    #[ppc_alias = __extendsfkf2]
+    #[cfg(f128_enabled)]
+    pub extern "C" fn __extendsftf2(a: f32) -> f128 {
+        extend(a)
+    }
+
+    #[avr_skip]
+    #[aapcs_on_arm]
+    #[ppc_alias = __extenddfkf2]
+    #[cfg(f128_enabled)]
+    pub extern "C" fn __extenddftf2(a: f64) -> f128 {
+        extend(a)
     }
 }
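Extension never rounds: every source-format value, including subnormals and infinities, is exactly representable in the destination format, which is why `extend` has no rounding path. A quick check through the widely available f32 -> f64 symbol (a sketch; the f16/f128 variants additionally need the gated features above):

```rust
extern "C" {
    fn __extendsfdf2(a: f32) -> f64;
}

fn main() {
    for x in [0.0f32, -0.0, 1.5, f32::MIN_POSITIVE / 2.0, f32::INFINITY] {
        // Widening conversion is exact, so it must agree with `as` exactly.
        assert_eq!(unsafe { __extendsfdf2(x) }, x as f64);
    }
    println!("extend is exact");
}
```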
diff --git a/src/float/mod.rs b/src/float/mod.rs
index fdbe9dde..704bba0c 100644
--- a/src/float/mod.rs
+++ b/src/float/mod.rs
@@ -1,6 +1,6 @@
 use core::ops;

-use super::int::Int;
+use crate::int::{DInt, Int, MinInt};

 pub mod add;
 pub mod cmp;
@@ -12,8 +12,12 @@ pub mod pow;
 pub mod sub;
 pub mod trunc;

+/// Wrapper to extract the integer type half of the float's size
+pub(crate) type HalfRep<F> = <<F as Float>::Int as DInt>::H;
+
 public_test_dep! {
 /// Trait for some basic operations on floats
+#[allow(dead_code)]
 pub(crate) trait Float:
     Copy
     + core::fmt::Debug
@@ -27,10 +31,10 @@ pub(crate) trait Float:
     + ops::Rem<Output = Self>
 {
     /// A uint of the same width as the float
-    type Int: Int;
+    type Int: Int<OtherSign = Self::SignedInt, UnsignedInt = Self::Int>;

     /// An int of the same width as the float
-    type SignedInt: Int;
+    type SignedInt: Int + MinInt<OtherSign = Self::Int, UnsignedInt = Self::Int>;

     /// An int capable of containing the exponent bits plus a sign bit. This is signed.
     type ExpInt: Int;
@@ -47,7 +51,7 @@ pub(crate) trait Float:
     /// The bitwidth of the exponent
     const EXPONENT_BITS: u32 = Self::BITS - Self::SIGNIFICAND_BITS - 1;

-    /// The maximum value of the exponent
+    /// The saturated value of the exponent (infinite representation), in the rightmost position.
     const EXPONENT_MAX: u32 = (1 << Self::EXPONENT_BITS) - 1;

     /// The exponent bias value
@@ -59,7 +63,7 @@ pub(crate) trait Float:
     /// A mask for the significand
     const SIGNIFICAND_MASK: Self::Int;

-    // The implicit bit of the float format
+    /// The implicit bit of the float format
     const IMPLICIT_BIT: Self::Int;

     /// A mask for the exponent
@@ -76,10 +80,10 @@ pub(crate) trait Float:
     /// compared.
     fn eq_repr(self, rhs: Self) -> bool;

-    /// Returns the sign bit
-    fn sign(self) -> bool;
+    /// Returns true if the sign is negative
+    fn is_sign_negative(self) -> bool;

-    /// Returns the exponent with bias
+    /// Returns the exponent, not adjusting for bias.
     fn exp(self) -> Self::ExpInt;

     /// Returns the significand with no implicit bit (or the "fractional" part)
@@ -127,14 +131,27 @@ macro_rules! float_impl {
             self.to_bits() as Self::SignedInt
         }
         fn eq_repr(self, rhs: Self) -> bool {
-            if self.is_nan() && rhs.is_nan() {
+            #[cfg(feature = "mangled-names")]
+            fn is_nan(x: $ty) -> bool {
+                // When using mangled-names, the "real" compiler-builtins might not have the
+                // necessary builtin (__unordtf2) to test whether `f128` is NaN.
+                // FIXME(f16_f128): Remove once the nightly toolchain has the __unordtf2 builtin
+                // x is NaN if all the bits of the exponent are set and the significand is non-0
+                x.repr() & $ty::EXPONENT_MASK == $ty::EXPONENT_MASK
+                    && x.repr() & $ty::SIGNIFICAND_MASK != 0
+            }
+            #[cfg(not(feature = "mangled-names"))]
+            fn is_nan(x: $ty) -> bool {
+                x.is_nan()
+            }
+            if is_nan(self) && is_nan(rhs) {
                 true
             } else {
                 self.repr() == rhs.repr()
             }
         }
-        fn sign(self) -> bool {
-            self.signed_repr() < Self::SignedInt::ZERO
+        fn is_sign_negative(self) -> bool {
+            self.is_sign_negative()
         }
         fn exp(self) -> Self::ExpInt {
             ((self.to_bits() & Self::EXPONENT_MASK) >> Self::SIGNIFICAND_BITS) as Self::ExpInt
@@ -158,7 +175,7 @@ macro_rules! float_impl {
         fn normalize(significand: Self::Int) -> (i32, Self::Int) {
             let shift = significand
                 .leading_zeros()
-                .wrapping_sub((Self::Int::ONE << Self::SIGNIFICAND_BITS).leading_zeros());
+                .wrapping_sub(Self::EXPONENT_BITS);
             (
                 1i32.wrapping_sub(shift as i32),
                 significand << shift as Self::Int,
@@ -171,5 +188,9 @@ macro_rules! float_impl {
     };
 }

+#[cfg(f16_enabled)]
+float_impl!(f16, u16, i16, i8, 16, 10);
 float_impl!(f32, u32, i32, i16, 32, 23);
 float_impl!(f64, u64, i64, i16, 64, 52);
+#[cfg(f128_enabled)]
+float_impl!(f128, u128, i128, i16, 128, 112);
diff --git a/src/float/mul.rs b/src/float/mul.rs
index c89f2275..a4c69ea8 100644
--- a/src/float/mul.rs
+++ b/src/float/mul.rs
@@ -1,5 +1,5 @@
-use float::Float;
-use int::{CastInto, DInt, HInt, Int};
+use crate::float::Float;
+use crate::int::{CastInto, DInt, HInt, Int, MinInt};

 fn mul<F: Float>(a: F, b: F) -> F
 where
@@ -149,18 +149,13 @@ where
         }

         // Otherwise, shift the significand of the result so that the round
-        // bit is the high bit of productLo.
-        if shift < bits {
-            let sticky = product_low << (bits - shift);
-            product_low = product_high << (bits - shift) | product_low >> shift | sticky;
-            product_high >>= shift;
-        } else if shift < (2 * bits) {
-            let sticky = product_high << (2 * bits - shift) | product_low;
-            product_low = product_high >> (shift - bits) | sticky;
-            product_high = zero;
-        } else {
-            product_high = zero;
-        }
+        // bit is the high bit of `product_low`.
+        // Ensure one of the non-highest bits in `product_low` is set if the shifted-out bits are
+        // not all zero, so that the result is correctly rounded below.
+        let sticky = product_low << (bits - shift) != zero;
+        product_low =
+            product_high << (bits - shift) | product_low >> shift | (sticky as u32).cast();
+        product_high >>= shift;
     } else {
         // Result is normal before rounding; insert the exponent.
         product_high &= significand_mask;
@@ -185,25 +180,23 @@ where
 }

 intrinsics!
{ + #[avr_skip] #[aapcs_on_arm] #[arm_aeabi_alias = __aeabi_fmul] pub extern "C" fn __mulsf3(a: f32, b: f32) -> f32 { mul(a, b) } + #[avr_skip] #[aapcs_on_arm] #[arm_aeabi_alias = __aeabi_dmul] pub extern "C" fn __muldf3(a: f64, b: f64) -> f64 { mul(a, b) } - #[cfg(target_arch = "arm")] - pub extern "C" fn __mulsf3vfp(a: f32, b: f32) -> f32 { - a * b - } - - #[cfg(target_arch = "arm")] - pub extern "C" fn __muldf3vfp(a: f64, b: f64) -> f64 { - a * b + #[ppc_alias = __mulkf3] + #[cfg(f128_enabled)] + pub extern "C" fn __multf3(a: f128, b: f128) -> f128 { + mul(a, b) } } diff --git a/src/float/pow.rs b/src/float/pow.rs index a75340c3..dac768f7 100644 --- a/src/float/pow.rs +++ b/src/float/pow.rs @@ -1,5 +1,5 @@ -use float::Float; -use int::Int; +use crate::float::Float; +use crate::int::Int; /// Returns `a` raised to the power `b` fn pow(a: F, b: i32) -> F { @@ -26,11 +26,22 @@ fn pow(a: F, b: i32) -> F { } intrinsics! { + #[avr_skip] pub extern "C" fn __powisf2(a: f32, b: i32) -> f32 { pow(a, b) } + #[avr_skip] pub extern "C" fn __powidf2(a: f64, b: i32) -> f64 { pow(a, b) } + + #[avr_skip] + #[ppc_alias = __powikf2] + #[cfg(f128_enabled)] + // FIXME(f16_f128): MSVC cannot build these until `__divtf3` is available in nightly. + #[cfg(not(target_env = "msvc"))] + pub extern "C" fn __powitf2(a: f128, b: i32) -> f128 { + pow(a, b) + } } diff --git a/src/float/sub.rs b/src/float/sub.rs index 8d300e9d..7e8a8945 100644 --- a/src/float/sub.rs +++ b/src/float/sub.rs @@ -1,25 +1,26 @@ -use float::add::__adddf3; -use float::add::__addsf3; -use float::Float; +use crate::float::Float; intrinsics! { + #[avr_skip] #[arm_aeabi_alias = __aeabi_fsub] pub extern "C" fn __subsf3(a: f32, b: f32) -> f32 { - __addsf3(a, f32::from_repr(b.repr() ^ f32::SIGN_MASK)) + crate::float::add::__addsf3(a, f32::from_repr(b.repr() ^ f32::SIGN_MASK)) } + #[avr_skip] #[arm_aeabi_alias = __aeabi_dsub] pub extern "C" fn __subdf3(a: f64, b: f64) -> f64 { - __adddf3(a, f64::from_repr(b.repr() ^ f64::SIGN_MASK)) + crate::float::add::__adddf3(a, f64::from_repr(b.repr() ^ f64::SIGN_MASK)) } - #[cfg(target_arch = "arm")] - pub extern "C" fn __subsf3vfp(a: f32, b: f32) -> f32 { - a - b - } + #[ppc_alias = __subkf3] + #[cfg(f128_enabled)] + pub extern "C" fn __subtf3(a: f128, b: f128) -> f128 { + #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))] + use crate::float::add::__addkf3 as __addtf3; + #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))] + use crate::float::add::__addtf3; - #[cfg(target_arch = "arm")] - pub extern "C" fn __subdf3vfp(a: f64, b: f64) -> f64 { - a - b + __addtf3(a, f128::from_repr(b.repr() ^ f128::SIGN_MASK)) } } diff --git a/src/float/trunc.rs b/src/float/trunc.rs index d7371308..a25b6eab 100644 --- a/src/float/trunc.rs +++ b/src/float/trunc.rs @@ -1,5 +1,5 @@ -use float::Float; -use int::{CastInto, Int}; +use crate::float::Float; +use crate::int::{CastInto, Int, MinInt}; fn trunc(a: F) -> R where @@ -52,8 +52,10 @@ where // destination format. We can convert by simply right-shifting with // rounding and adjusting the exponent. abs_result = (a_abs >> sign_bits_delta).cast(); - let tmp = src_exp_bias.wrapping_sub(dst_exp_bias) << R::SIGNIFICAND_BITS; - abs_result = abs_result.wrapping_sub(tmp.cast()); + // Cast before shifting to prevent overflow. 
+ let bias_diff: R::Int = src_exp_bias.wrapping_sub(dst_exp_bias).cast(); + let tmp = bias_diff << R::SIGNIFICAND_BITS; + abs_result = abs_result.wrapping_sub(tmp); let round_bits = a_abs & round_mask; if round_bits > halfway { @@ -67,13 +69,17 @@ where // a is NaN. // Conjure the result by beginning with infinity, setting the qNaN // bit and inserting the (truncated) trailing NaN field. - abs_result = (dst_inf_exp << R::SIGNIFICAND_BITS).cast(); + // Cast before shifting to prevent overflow. + let dst_inf_exp: R::Int = dst_inf_exp.cast(); + abs_result = dst_inf_exp << R::SIGNIFICAND_BITS; abs_result |= dst_qnan; abs_result |= dst_nan_code & ((a_abs & src_nan_code) >> (F::SIGNIFICAND_BITS - R::SIGNIFICAND_BITS)).cast(); } else if a_abs >= overflow { // a overflows to infinity. - abs_result = (dst_inf_exp << R::SIGNIFICAND_BITS).cast(); + // Cast before shifting to prevent overflow. + let dst_inf_exp: R::Int = dst_inf_exp.cast(); + abs_result = dst_inf_exp << R::SIGNIFICAND_BITS; } else { // a underflows on conversion to the destination type or is an exact // zero. The result may be a denormal or zero. Extract the exponent @@ -112,14 +118,62 @@ where } intrinsics! { + #[avr_skip] #[aapcs_on_arm] #[arm_aeabi_alias = __aeabi_d2f] pub extern "C" fn __truncdfsf2(a: f64) -> f32 { trunc(a) } +} + +intrinsics! { + #[avr_skip] + #[aapcs_on_arm] + #[apple_f16_ret_abi] + #[arm_aeabi_alias = __aeabi_f2h] + #[cfg(f16_enabled)] + pub extern "C" fn __truncsfhf2(a: f32) -> f16 { + trunc(a) + } + + #[avr_skip] + #[aapcs_on_arm] + #[apple_f16_ret_abi] + #[cfg(f16_enabled)] + pub extern "C" fn __gnu_f2h_ieee(a: f32) -> f16 { + trunc(a) + } + + #[avr_skip] + #[aapcs_on_arm] + #[apple_f16_ret_abi] + #[arm_aeabi_alias = __aeabi_d2h] + #[cfg(f16_enabled)] + pub extern "C" fn __truncdfhf2(a: f64) -> f16 { + trunc(a) + } + + #[avr_skip] + #[aapcs_on_arm] + #[ppc_alias = __trunckfhf2] + #[cfg(all(f16_enabled, f128_enabled))] + pub extern "C" fn __trunctfhf2(a: f128) -> f16 { + trunc(a) + } - #[cfg(target_arch = "arm")] - pub extern "C" fn __truncdfsf2vfp(a: f64) -> f32 { - a as f32 + #[avr_skip] + #[aapcs_on_arm] + #[ppc_alias = __trunckfsf2] + #[cfg(f128_enabled)] + pub extern "C" fn __trunctfsf2(a: f128) -> f32 { + trunc(a) + } + + #[avr_skip] + #[aapcs_on_arm] + #[ppc_alias = __trunckfdf2] + #[cfg(f128_enabled)] + pub extern "C" fn __trunctfdf2(a: f128) -> f64 { + trunc(a) } } diff --git a/src/hexagon.rs b/src/hexagon.rs new file mode 100644 index 00000000..91cf91c3 --- /dev/null +++ b/src/hexagon.rs @@ -0,0 +1,55 @@ +#![cfg(not(feature = "no-asm"))] + +use core::arch::global_asm; + +global_asm!(include_str!("hexagon/func_macro.s"), options(raw)); + +global_asm!(include_str!("hexagon/dfaddsub.s"), options(raw)); + +global_asm!(include_str!("hexagon/dfdiv.s"), options(raw)); + +global_asm!(include_str!("hexagon/dffma.s"), options(raw)); + +global_asm!(include_str!("hexagon/dfminmax.s"), options(raw)); + +global_asm!(include_str!("hexagon/dfmul.s"), options(raw)); + +global_asm!(include_str!("hexagon/dfsqrt.s"), options(raw)); + +global_asm!(include_str!("hexagon/divdi3.s"), options(raw)); + +global_asm!(include_str!("hexagon/divsi3.s"), options(raw)); + +global_asm!(include_str!("hexagon/fastmath2_dlib_asm.s"), options(raw)); + +global_asm!(include_str!("hexagon/fastmath2_ldlib_asm.s"), options(raw)); + +global_asm!( + include_str!("hexagon/memcpy_forward_vp4cp4n2.s"), + options(raw) +); + +global_asm!( + include_str!("hexagon/memcpy_likely_aligned.s"), + options(raw) +); + 
+global_asm!(include_str!("hexagon/moddi3.s"), options(raw)); + +global_asm!(include_str!("hexagon/modsi3.s"), options(raw)); + +global_asm!(include_str!("hexagon/sfdiv_opt.s"), options(raw)); + +global_asm!(include_str!("hexagon/sfsqrt_opt.s"), options(raw)); + +global_asm!(include_str!("hexagon/udivdi3.s"), options(raw)); + +global_asm!(include_str!("hexagon/udivmoddi4.s"), options(raw)); + +global_asm!(include_str!("hexagon/udivmodsi4.s"), options(raw)); + +global_asm!(include_str!("hexagon/udivsi3.s"), options(raw)); + +global_asm!(include_str!("hexagon/umoddi3.s"), options(raw)); + +global_asm!(include_str!("hexagon/umodsi3.s"), options(raw)); diff --git a/src/hexagon/dfaddsub.s b/src/hexagon/dfaddsub.s new file mode 100644 index 00000000..1f59e460 --- /dev/null +++ b/src/hexagon/dfaddsub.s @@ -0,0 +1,321 @@ + .text + .global __hexagon_adddf3 + .global __hexagon_subdf3 + .type __hexagon_adddf3, @function + .type __hexagon_subdf3, @function + +.global __qdsp_adddf3 ; .set __qdsp_adddf3, __hexagon_adddf3 +.global __hexagon_fast_adddf3 ; .set __hexagon_fast_adddf3, __hexagon_adddf3 +.global __hexagon_fast2_adddf3 ; .set __hexagon_fast2_adddf3, __hexagon_adddf3 +.global __qdsp_subdf3 ; .set __qdsp_subdf3, __hexagon_subdf3 +.global __hexagon_fast_subdf3 ; .set __hexagon_fast_subdf3, __hexagon_subdf3 +.global __hexagon_fast2_subdf3 ; .set __hexagon_fast2_subdf3, __hexagon_subdf3 + + .p2align 5 +__hexagon_adddf3: + { + r4 = extractu(r1,#11,#20) + r5 = extractu(r3,#11,#20) + r13:12 = combine(##0x20000000,#0) + } + { + p3 = dfclass(r1:0,#2) + p3 = dfclass(r3:2,#2) + r9:8 = r13:12 + p2 = cmp.gtu(r5,r4) + } + { + if (!p3) jump .Ladd_abnormal + if (p2) r1:0 = r3:2 + if (p2) r3:2 = r1:0 + if (p2) r5:4 = combine(r4,r5) + } + { + r13:12 = insert(r1:0,#52,#11 -2) + r9:8 = insert(r3:2,#52,#11 -2) + r15 = sub(r4,r5) + r7:6 = combine(#62,#1) + } + + + + + +.Ladd_continue: + { + r15 = min(r15,r7) + + r11:10 = neg(r13:12) + p2 = cmp.gt(r1,#-1) + r14 = #0 + } + { + if (!p2) r13:12 = r11:10 + r11:10 = extractu(r9:8,r15:14) + r9:8 = ASR(r9:8,r15) + + + + + r15:14 = #0 + } + { + p1 = cmp.eq(r11:10,r15:14) + if (!p1.new) r8 = or(r8,r6) + r5 = add(r4,#-1024 -60) + p3 = cmp.gt(r3,#-1) + } + { + r13:12 = add(r13:12,r9:8) + r11:10 = sub(r13:12,r9:8) + r7:6 = combine(#54,##2045) + } + { + p0 = cmp.gtu(r4,r7) + p0 = !cmp.gtu(r4,r6) + if (!p0.new) jump:nt .Ladd_ovf_unf + if (!p3) r13:12 = r11:10 + } + { + r1:0 = convert_d2df(r13:12) + p0 = cmp.eq(r13,#0) + p0 = cmp.eq(r12,#0) + if (p0.new) jump:nt .Ladd_zero + } + { + r1 += asl(r5,#20) + jumpr r31 + } + .falign +__hexagon_subdf3: + { + r3 = togglebit(r3,#31) + jump __qdsp_adddf3 + } + + + .falign +.Ladd_zero: + + + { + r28 = USR + r1:0 = #0 + r3 = #1 + } + { + r28 = extractu(r28,#2,#22) + r3 = asl(r3,#31) + } + { + p0 = cmp.eq(r28,#2) + if (p0.new) r1 = xor(r1,r3) + jumpr r31 + } + .falign +.Ladd_ovf_unf: + { + r1:0 = convert_d2df(r13:12) + p0 = cmp.eq(r13,#0) + p0 = cmp.eq(r12,#0) + if (p0.new) jump:nt .Ladd_zero + } + { + r28 = extractu(r1,#11,#20) + r1 += asl(r5,#20) + } + { + r5 = add(r5,r28) + r3:2 = combine(##0x00100000,#0) + } + { + p0 = cmp.gt(r5,##1024 +1024 -2) + if (p0.new) jump:nt .Ladd_ovf + } + { + p0 = cmp.gt(r5,#0) + if (p0.new) jumpr:t r31 + r28 = sub(#1,r5) + } + { + r3:2 = insert(r1:0,#52,#0) + r1:0 = r13:12 + } + { + r3:2 = lsr(r3:2,r28) + } + { + r1:0 = insert(r3:2,#63,#0) + jumpr r31 + } + .falign +.Ladd_ovf: + + { + r1:0 = r13:12 + r28 = USR + r13:12 = combine(##0x7fefffff,#-1) + } + { + r5 = extractu(r28,#2,#22) + r28 = or(r28,#0x28) + r9:8 = 
combine(##0x7ff00000,#0) + } + { + USR = r28 + r5 ^= lsr(r1,#31) + r28 = r5 + } + { + p0 = !cmp.eq(r28,#1) + p0 = !cmp.eq(r5,#2) + if (p0.new) r13:12 = r9:8 + } + { + r1:0 = insert(r13:12,#63,#0) + } + { + p0 = dfcmp.eq(r1:0,r1:0) + jumpr r31 + } + +.Ladd_abnormal: + { + r13:12 = extractu(r1:0,#63,#0) + r9:8 = extractu(r3:2,#63,#0) + } + { + p3 = cmp.gtu(r13:12,r9:8) + if (!p3.new) r1:0 = r3:2 + if (!p3.new) r3:2 = r1:0 + } + { + + p0 = dfclass(r1:0,#0x0f) + if (!p0.new) jump:nt .Linvalid_nan_add + if (!p3) r13:12 = r9:8 + if (!p3) r9:8 = r13:12 + } + { + + + p1 = dfclass(r1:0,#0x08) + if (p1.new) jump:nt .Linf_add + } + { + p2 = dfclass(r3:2,#0x01) + if (p2.new) jump:nt .LB_zero + r13:12 = #0 + } + + { + p0 = dfclass(r1:0,#4) + if (p0.new) jump:nt .Ladd_two_subnormal + r13:12 = combine(##0x20000000,#0) + } + { + r4 = extractu(r1,#11,#20) + r5 = #1 + + r9:8 = asl(r9:8,#11 -2) + } + + + + { + r13:12 = insert(r1:0,#52,#11 -2) + r15 = sub(r4,r5) + r7:6 = combine(#62,#1) + jump .Ladd_continue + } + +.Ladd_two_subnormal: + { + r13:12 = extractu(r1:0,#63,#0) + r9:8 = extractu(r3:2,#63,#0) + } + { + r13:12 = neg(r13:12) + r9:8 = neg(r9:8) + p0 = cmp.gt(r1,#-1) + p1 = cmp.gt(r3,#-1) + } + { + if (p0) r13:12 = r1:0 + if (p1) r9:8 = r3:2 + } + { + r13:12 = add(r13:12,r9:8) + } + { + r9:8 = neg(r13:12) + p0 = cmp.gt(r13,#-1) + r3:2 = #0 + } + { + if (!p0) r1:0 = r9:8 + if (p0) r1:0 = r13:12 + r3 = ##0x80000000 + } + { + if (!p0) r1 = or(r1,r3) + p0 = dfcmp.eq(r1:0,r3:2) + if (p0.new) jump:nt .Lzero_plus_zero + } + { + jumpr r31 + } + +.Linvalid_nan_add: + { + r28 = convert_df2sf(r1:0) + p0 = dfclass(r3:2,#0x0f) + if (p0.new) r3:2 = r1:0 + } + { + r2 = convert_df2sf(r3:2) + r1:0 = #-1 + jumpr r31 + } + .falign +.LB_zero: + { + p0 = dfcmp.eq(r13:12,r1:0) + if (!p0.new) jumpr:t r31 + } + + + + +.Lzero_plus_zero: + { + p0 = cmp.eq(r1:0,r3:2) + if (p0.new) jumpr:t r31 + } + { + r28 = USR + } + { + r28 = extractu(r28,#2,#22) + r1:0 = #0 + } + { + p0 = cmp.eq(r28,#2) + if (p0.new) r1 = ##0x80000000 + jumpr r31 + } +.Linf_add: + + { + p0 = !cmp.eq(r1,r3) + p0 = dfclass(r3:2,#8) + if (!p0.new) jumpr:t r31 + } + { + r2 = ##0x7f800001 + } + { + r1:0 = convert_sf2df(r2) + jumpr r31 + } +.size __hexagon_adddf3,.-__hexagon_adddf3 diff --git a/src/hexagon/dfdiv.s b/src/hexagon/dfdiv.s new file mode 100644 index 00000000..6d65dbfc --- /dev/null +++ b/src/hexagon/dfdiv.s @@ -0,0 +1,372 @@ + .text + .global __hexagon_divdf3 + .type __hexagon_divdf3,@function + .global __qdsp_divdf3 ; .set __qdsp_divdf3, __hexagon_divdf3 + .global __hexagon_fast_divdf3 ; .set __hexagon_fast_divdf3, __hexagon_divdf3 + .global __hexagon_fast2_divdf3 ; .set __hexagon_fast2_divdf3, __hexagon_divdf3 + .p2align 5 +__hexagon_divdf3: + { + p2 = dfclass(r1:0,#0x02) + p2 = dfclass(r3:2,#0x02) + r13:12 = combine(r3,r1) + r28 = xor(r1,r3) + } + { + if (!p2) jump .Ldiv_abnormal + r7:6 = extractu(r3:2,#23,#52 -23) + r8 = ##0x3f800001 + } + { + r9 = or(r8,r6) + r13 = extractu(r13,#11,#52 -32) + r12 = extractu(r12,#11,#52 -32) + p3 = cmp.gt(r28,#-1) + } + + +.Ldenorm_continue: + { + r11,p0 = sfrecipa(r8,r9) + r10 = and(r8,#-2) + r28 = #1 + r12 = sub(r12,r13) + } + + + { + r10 -= sfmpy(r11,r9):lib + r1 = insert(r28,#11 +1,#52 -32) + r13 = ##0x00800000 << 3 + } + { + r11 += sfmpy(r11,r10):lib + r3 = insert(r28,#11 +1,#52 -32) + r10 = and(r8,#-2) + } + { + r10 -= sfmpy(r11,r9):lib + r5 = #-0x3ff +1 + r4 = #0x3ff -1 + } + { + r11 += sfmpy(r11,r10):lib + p1 = cmp.gt(r12,r5) + p1 = !cmp.gt(r12,r4) + } + { + r13 = insert(r11,#23,#3) + r5:4 = #0 + r12 = 
add(r12,#-61) + } + + + + + { + r13 = add(r13,#((-3) << 3)) + } + { r7:6 = mpyu(r13,r1); r1:0 = asl(r1:0,# ( 15 )); }; { r6 = # 0; r1:0 -= mpyu(r7,r2); r15:14 = mpyu(r7,r3); }; { r5:4 += ASL(r7:6, # ( 14 )); r1:0 -= asl(r15:14, # 32); } + { r7:6 = mpyu(r13,r1); r1:0 = asl(r1:0,# ( 15 )); }; { r6 = # 0; r1:0 -= mpyu(r7,r2); r15:14 = mpyu(r7,r3); }; { r5:4 += ASR(r7:6, # ( 1 )); r1:0 -= asl(r15:14, # 32); } + { r7:6 = mpyu(r13,r1); r1:0 = asl(r1:0,# ( 15 )); }; { r6 = # 0; r1:0 -= mpyu(r7,r2); r15:14 = mpyu(r7,r3); }; { r5:4 += ASR(r7:6, # ( 16 )); r1:0 -= asl(r15:14, # 32); } + { r7:6 = mpyu(r13,r1); r1:0 = asl(r1:0,# ( 15 )); }; { r6 = # 0; r1:0 -= mpyu(r7,r2); r15:14 = mpyu(r7,r3); }; { r5:4 += ASR(r7:6, # ( 31 )); r1:0 -= asl(r15:14, # 32); r7:6=# ( 0 ); } + + + + + + + + { + + r15:14 = sub(r1:0,r3:2) + p0 = cmp.gtu(r3:2,r1:0) + + if (!p0.new) r6 = #2 + } + { + r5:4 = add(r5:4,r7:6) + if (!p0) r1:0 = r15:14 + r15:14 = #0 + } + { + p0 = cmp.eq(r1:0,r15:14) + if (!p0.new) r4 = or(r4,r28) + } + { + r7:6 = neg(r5:4) + } + { + if (!p3) r5:4 = r7:6 + } + { + r1:0 = convert_d2df(r5:4) + if (!p1) jump .Ldiv_ovf_unf + } + { + r1 += asl(r12,#52 -32) + jumpr r31 + } + +.Ldiv_ovf_unf: + { + r1 += asl(r12,#52 -32) + r13 = extractu(r1,#11,#52 -32) + } + { + r7:6 = abs(r5:4) + r12 = add(r12,r13) + } + { + p0 = cmp.gt(r12,##0x3ff +0x3ff) + if (p0.new) jump:nt .Ldiv_ovf + } + { + p0 = cmp.gt(r12,#0) + if (p0.new) jump:nt .Lpossible_unf2 + } + { + r13 = add(clb(r7:6),#-1) + r12 = sub(#7,r12) + r10 = USR + r11 = #63 + } + { + r13 = min(r12,r11) + r11 = or(r10,#0x030) + r7:6 = asl(r7:6,r13) + r12 = #0 + } + { + r15:14 = extractu(r7:6,r13:12) + r7:6 = lsr(r7:6,r13) + r3:2 = #1 + } + { + p0 = cmp.gtu(r3:2,r15:14) + if (!p0.new) r6 = or(r2,r6) + r7 = setbit(r7,#52 -32+4) + } + { + r5:4 = neg(r7:6) + p0 = bitsclr(r6,#(1<<4)-1) + if (!p0.new) r10 = r11 + } + { + USR = r10 + if (p3) r5:4 = r7:6 + r10 = #-0x3ff -(52 +4) + } + { + r1:0 = convert_d2df(r5:4) + } + { + r1 += asl(r10,#52 -32) + jumpr r31 + } + + +.Lpossible_unf2: + + + { + r3:2 = extractu(r1:0,#63,#0) + r15:14 = combine(##0x00100000,#0) + r10 = #0x7FFF + } + { + p0 = dfcmp.eq(r15:14,r3:2) + p0 = bitsset(r7,r10) + } + + + + + + + { + if (!p0) jumpr r31 + r10 = USR + } + + { + r10 = or(r10,#0x30) + } + { + USR = r10 + } + { + p0 = dfcmp.eq(r1:0,r1:0) + jumpr r31 + } + +.Ldiv_ovf: + + + + { + r10 = USR + r3:2 = combine(##0x7fefffff,#-1) + r1 = mux(p3,#0,#-1) + } + { + r7:6 = combine(##0x7ff00000,#0) + r5 = extractu(r10,#2,#22) + r10 = or(r10,#0x28) + } + { + USR = r10 + r5 ^= lsr(r1,#31) + r4 = r5 + } + { + p0 = !cmp.eq(r4,#1) + p0 = !cmp.eq(r5,#2) + if (p0.new) r3:2 = r7:6 + p0 = dfcmp.eq(r3:2,r3:2) + } + { + r1:0 = insert(r3:2,#63,#0) + jumpr r31 + } + + + + + + + +.Ldiv_abnormal: + { + p0 = dfclass(r1:0,#0x0F) + p0 = dfclass(r3:2,#0x0F) + p3 = cmp.gt(r28,#-1) + } + { + p1 = dfclass(r1:0,#0x08) + p1 = dfclass(r3:2,#0x08) + } + { + p2 = dfclass(r1:0,#0x01) + p2 = dfclass(r3:2,#0x01) + } + { + if (!p0) jump .Ldiv_nan + if (p1) jump .Ldiv_invalid + } + { + if (p2) jump .Ldiv_invalid + } + { + p2 = dfclass(r1:0,#(0x0F ^ 0x01)) + p2 = dfclass(r3:2,#(0x0F ^ 0x08)) + } + { + p1 = dfclass(r1:0,#(0x0F ^ 0x08)) + p1 = dfclass(r3:2,#(0x0F ^ 0x01)) + } + { + if (!p2) jump .Ldiv_zero_result + if (!p1) jump .Ldiv_inf_result + } + + + + + + { + p0 = dfclass(r1:0,#0x02) + p1 = dfclass(r3:2,#0x02) + r10 = ##0x00100000 + } + { + r13:12 = combine(r3,r1) + r1 = insert(r10,#11 +1,#52 -32) + r3 = insert(r10,#11 +1,#52 -32) + } + { + if (p0) r1 = or(r1,r10) + if (p1) r3 = 
or(r3,r10) + } + { + r5 = add(clb(r1:0),#-11) + r4 = add(clb(r3:2),#-11) + r10 = #1 + } + { + r12 = extractu(r12,#11,#52 -32) + r13 = extractu(r13,#11,#52 -32) + } + { + r1:0 = asl(r1:0,r5) + r3:2 = asl(r3:2,r4) + if (!p0) r12 = sub(r10,r5) + if (!p1) r13 = sub(r10,r4) + } + { + r7:6 = extractu(r3:2,#23,#52 -23) + } + { + r9 = or(r8,r6) + jump .Ldenorm_continue + } + +.Ldiv_zero_result: + { + r1 = xor(r1,r3) + r3:2 = #0 + } + { + r1:0 = insert(r3:2,#63,#0) + jumpr r31 + } +.Ldiv_inf_result: + { + p2 = dfclass(r3:2,#0x01) + p2 = dfclass(r1:0,#(0x0F ^ 0x08)) + } + { + r10 = USR + if (!p2) jump 1f + r1 = xor(r1,r3) + } + { + r10 = or(r10,#0x04) + } + { + USR = r10 + } +1: + { + r3:2 = combine(##0x7ff00000,#0) + p0 = dfcmp.uo(r3:2,r3:2) + } + { + r1:0 = insert(r3:2,#63,#0) + jumpr r31 + } +.Ldiv_nan: + { + p0 = dfclass(r1:0,#0x10) + p1 = dfclass(r3:2,#0x10) + if (!p0.new) r1:0 = r3:2 + if (!p1.new) r3:2 = r1:0 + } + { + r5 = convert_df2sf(r1:0) + r4 = convert_df2sf(r3:2) + } + { + r1:0 = #-1 + jumpr r31 + } + +.Ldiv_invalid: + { + r10 = ##0x7f800001 + } + { + r1:0 = convert_sf2df(r10) + jumpr r31 + } +.size __hexagon_divdf3,.-__hexagon_divdf3 diff --git a/src/hexagon/dffma.s b/src/hexagon/dffma.s new file mode 100644 index 00000000..97d05eb1 --- /dev/null +++ b/src/hexagon/dffma.s @@ -0,0 +1,534 @@ + .text + .global __hexagon_fmadf4 + .type __hexagon_fmadf4,@function + .global __hexagon_fmadf5 + .type __hexagon_fmadf5,@function + .global __qdsp_fmadf5 ; .set __qdsp_fmadf5, __hexagon_fmadf5 + .p2align 5 +__hexagon_fmadf4: +__hexagon_fmadf5: +fma: + { + p0 = dfclass(r1:0,#2) + p0 = dfclass(r3:2,#2) + r13:12 = #0 + r15:14 = #0 + } + { + r13:12 = insert(r1:0,#52,#11 -3) + r15:14 = insert(r3:2,#52,#11 -3) + r7 = ##0x10000000 + allocframe(#32) + } + { + r9:8 = mpyu(r12,r14) + if (!p0) jump .Lfma_abnormal_ab + r13 = or(r13,r7) + r15 = or(r15,r7) + } + { + p0 = dfclass(r5:4,#2) + if (!p0.new) jump:nt .Lfma_abnormal_c + r11:10 = combine(r7,#0) + r7:6 = combine(#0,r9) + } +.Lfma_abnormal_c_restart: + { + r7:6 += mpyu(r14,r13) + r11:10 = insert(r5:4,#52,#11 -3) + memd(r29+#0) = r17:16 + memd(r29+#8) = r19:18 + } + { + r7:6 += mpyu(r12,r15) + r19:18 = neg(r11:10) + p0 = cmp.gt(r5,#-1) + r28 = xor(r1,r3) + } + { + r18 = extractu(r1,#11,#20) + r19 = extractu(r3,#11,#20) + r17:16 = combine(#0,r7) + if (!p0) r11:10 = r19:18 + } + { + r17:16 += mpyu(r13,r15) + r9:8 = combine(r6,r8) + r18 = add(r18,r19) + + + + + r19 = extractu(r5,#11,#20) + } + { + r18 = add(r18,#-1023 +(4)) + p3 = !cmp.gt(r28,#-1) + r7:6 = #0 + r15:14 = #0 + } + { + r7:6 = sub(r7:6,r9:8,p3):carry + p0 = !cmp.gt(r28,#-1) + p1 = cmp.gt(r19,r18) + if (p1.new) r19:18 = combine(r18,r19) + } + { + r15:14 = sub(r15:14,r17:16,p3):carry + if (p0) r9:8 = r7:6 + + + + + r7:6 = #0 + r19 = sub(r18,r19) + } + { + if (p0) r17:16 = r15:14 + p0 = cmp.gt(r19,#63) + if (p1) r9:8 = r7:6 + if (p1) r7:6 = r9:8 + } + + + + + + + + { + if (p1) r17:16 = r11:10 + if (p1) r11:10 = r17:16 + if (p0) r19 = add(r19,#-64) + r28 = #63 + } + { + + if (p0) r7:6 = r11:10 + r28 = asr(r11,#31) + r13 = min(r19,r28) + r12 = #0 + } + + + + + + + { + if (p0) r11:10 = combine(r28,r28) + r5:4 = extract(r7:6,r13:12) + r7:6 = lsr(r7:6,r13) + r12 = sub(#64,r13) + } + { + r15:14 = #0 + r28 = #-2 + r7:6 |= lsl(r11:10,r12) + r11:10 = asr(r11:10,r13) + } + { + p3 = cmp.gtu(r5:4,r15:14) + if (p3.new) r6 = and(r6,r28) + + + + r15:14 = #1 + r5:4 = #0 + } + { + r9:8 = add(r7:6,r9:8,p3):carry + } + { + r17:16 = add(r11:10,r17:16,p3):carry + r28 = #62 + } + + + + + + + + { + r12 = 
add(clb(r17:16),#-2) + if (!cmp.eq(r12.new,r28)) jump:t 1f + } + + { + r11:10 = extractu(r9:8,#62,#2) + r9:8 = asl(r9:8,#62) + r18 = add(r18,#-62) + } + { + r17:16 = insert(r11:10,#62,#0) + } + { + r12 = add(clb(r17:16),#-2) + } + .falign +1: + { + r11:10 = asl(r17:16,r12) + r5:4 |= asl(r9:8,r12) + r13 = sub(#64,r12) + r18 = sub(r18,r12) + } + { + r11:10 |= lsr(r9:8,r13) + p2 = cmp.gtu(r15:14,r5:4) + r28 = #1023 +1023 -2 + } + { + if (!p2) r10 = or(r10,r14) + + p0 = !cmp.gt(r18,r28) + p0 = cmp.gt(r18,#1) + if (!p0.new) jump:nt .Lfma_ovf_unf + } + { + + p0 = cmp.gtu(r15:14,r11:10) + r1:0 = convert_d2df(r11:10) + r18 = add(r18,#-1023 -60) + r17:16 = memd(r29+#0) + } + { + r1 += asl(r18,#20) + r19:18 = memd(r29+#8) + if (!p0) dealloc_return + } +.Ladd_yields_zero: + + { + r28 = USR + r1:0 = #0 + } + { + r28 = extractu(r28,#2,#22) + r17:16 = memd(r29+#0) + r19:18 = memd(r29+#8) + } + { + p0 = cmp.eq(r28,#2) + if (p0.new) r1 = ##0x80000000 + dealloc_return + } +.Lfma_ovf_unf: + { + p0 = cmp.gtu(r15:14,r11:10) + if (p0.new) jump:nt .Ladd_yields_zero + } + { + r1:0 = convert_d2df(r11:10) + r18 = add(r18,#-1023 -60) + r28 = r18 + } + + + { + r1 += asl(r18,#20) + r7 = extractu(r1,#11,#20) + } + { + r6 = add(r18,r7) + r17:16 = memd(r29+#0) + r19:18 = memd(r29+#8) + r9:8 = abs(r11:10) + } + { + p0 = cmp.gt(r6,##1023 +1023) + if (p0.new) jump:nt .Lfma_ovf + } + { + p0 = cmp.gt(r6,#0) + if (p0.new) jump:nt .Lpossible_unf0 + } + { + + + + r7 = add(clb(r9:8),#-2) + r6 = sub(#1+5,r28) + p3 = cmp.gt(r11,#-1) + } + + + + { + r6 = add(r6,r7) + r9:8 = asl(r9:8,r7) + r1 = USR + r28 = #63 + } + { + r7 = min(r6,r28) + r6 = #0 + r0 = #0x0030 + } + { + r3:2 = extractu(r9:8,r7:6) + r9:8 = asr(r9:8,r7) + } + { + p0 = cmp.gtu(r15:14,r3:2) + if (!p0.new) r8 = or(r8,r14) + r9 = setbit(r9,#20 +3) + } + { + r11:10 = neg(r9:8) + p1 = bitsclr(r8,#(1<<3)-1) + if (!p1.new) r1 = or(r1,r0) + r3:2 = #0 + } + { + if (p3) r11:10 = r9:8 + USR = r1 + r28 = #-1023 -(52 +3) + } + { + r1:0 = convert_d2df(r11:10) + } + { + r1 += asl(r28,#20) + dealloc_return + } +.Lpossible_unf0: + { + r28 = ##0x7fefffff + r9:8 = abs(r11:10) + } + { + p0 = cmp.eq(r0,#0) + p0 = bitsclr(r1,r28) + if (!p0.new) dealloc_return:t + r28 = #0x7fff + } + { + p0 = bitsset(r9,r28) + r3 = USR + r2 = #0x0030 + } + { + if (p0) r3 = or(r3,r2) + } + { + USR = r3 + } + { + p0 = dfcmp.eq(r1:0,r1:0) + dealloc_return + } +.Lfma_ovf: + { + r28 = USR + r11:10 = combine(##0x7fefffff,#-1) + r1:0 = r11:10 + } + { + r9:8 = combine(##0x7ff00000,#0) + r3 = extractu(r28,#2,#22) + r28 = or(r28,#0x28) + } + { + USR = r28 + r3 ^= lsr(r1,#31) + r2 = r3 + } + { + p0 = !cmp.eq(r2,#1) + p0 = !cmp.eq(r3,#2) + } + { + p0 = dfcmp.eq(r9:8,r9:8) + if (p0.new) r11:10 = r9:8 + } + { + r1:0 = insert(r11:10,#63,#0) + dealloc_return + } +.Lfma_abnormal_ab: + { + r9:8 = extractu(r1:0,#63,#0) + r11:10 = extractu(r3:2,#63,#0) + deallocframe + } + { + p3 = cmp.gtu(r9:8,r11:10) + if (!p3.new) r1:0 = r3:2 + if (!p3.new) r3:2 = r1:0 + } + { + p0 = dfclass(r1:0,#0x0f) + if (!p0.new) jump:nt .Lnan + if (!p3) r9:8 = r11:10 + if (!p3) r11:10 = r9:8 + } + { + p1 = dfclass(r1:0,#0x08) + p1 = dfclass(r3:2,#0x0e) + } + { + p0 = dfclass(r1:0,#0x08) + p0 = dfclass(r3:2,#0x01) + } + { + if (p1) jump .Lab_inf + p2 = dfclass(r3:2,#0x01) + } + { + if (p0) jump .Linvalid + if (p2) jump .Lab_true_zero + r28 = ##0x7c000000 + } + + + + + + { + p0 = bitsclr(r1,r28) + if (p0.new) jump:nt .Lfma_ab_tiny + } + { + r28 = add(clb(r11:10),#-11) + } + { + r11:10 = asl(r11:10,r28) + } + { + r3:2 = insert(r11:10,#63,#0) + r1 -= 
asl(r28,#20) + } + jump fma + +.Lfma_ab_tiny: + r9:8 = combine(##0x00100000,#0) + { + r1:0 = insert(r9:8,#63,#0) + r3:2 = insert(r9:8,#63,#0) + } + jump fma + +.Lab_inf: + { + r3:2 = lsr(r3:2,#63) + p0 = dfclass(r5:4,#0x10) + } + { + r1:0 ^= asl(r3:2,#63) + if (p0) jump .Lnan + } + { + p1 = dfclass(r5:4,#0x08) + if (p1.new) jump:nt .Lfma_inf_plus_inf + } + + { + jumpr r31 + } + .falign +.Lfma_inf_plus_inf: + { + p0 = dfcmp.eq(r1:0,r5:4) + if (!p0.new) jump:nt .Linvalid + } + { + jumpr r31 + } + +.Lnan: + { + p0 = dfclass(r3:2,#0x10) + p1 = dfclass(r5:4,#0x10) + if (!p0.new) r3:2 = r1:0 + if (!p1.new) r5:4 = r1:0 + } + { + r3 = convert_df2sf(r3:2) + r2 = convert_df2sf(r5:4) + } + { + r3 = convert_df2sf(r1:0) + r1:0 = #-1 + jumpr r31 + } + +.Linvalid: + { + r28 = ##0x7f800001 + } + { + r1:0 = convert_sf2df(r28) + jumpr r31 + } + +.Lab_true_zero: + + { + p0 = dfclass(r5:4,#0x10) + if (p0.new) jump:nt .Lnan + if (p0.new) r1:0 = r5:4 + } + { + p0 = dfcmp.eq(r3:2,r5:4) + r1 = lsr(r1,#31) + } + { + r3 ^= asl(r1,#31) + if (!p0) r1:0 = r5:4 + if (!p0) jumpr r31 + } + + { + p0 = cmp.eq(r3:2,r5:4) + if (p0.new) jumpr:t r31 + r1:0 = r3:2 + } + { + r28 = USR + } + { + r28 = extractu(r28,#2,#22) + r1:0 = #0 + } + { + p0 = cmp.eq(r28,#2) + if (p0.new) r1 = ##0x80000000 + jumpr r31 + } + + + + + .falign +.Lfma_abnormal_c: + + + { + p0 = dfclass(r5:4,#0x10) + if (p0.new) jump:nt .Lnan + if (p0.new) r1:0 = r5:4 + deallocframe + } + { + p0 = dfclass(r5:4,#0x08) + if (p0.new) r1:0 = r5:4 + if (p0.new) jumpr:nt r31 + } + + + { + p0 = dfclass(r5:4,#0x01) + if (p0.new) jump:nt __hexagon_muldf3 + r28 = #1 + } + + + { + allocframe(#32) + r11:10 = #0 + r5 = insert(r28,#11,#20) + jump .Lfma_abnormal_c_restart + } +.size fma,.-fma diff --git a/src/hexagon/dfminmax.s b/src/hexagon/dfminmax.s new file mode 100644 index 00000000..953e773b --- /dev/null +++ b/src/hexagon/dfminmax.s @@ -0,0 +1,45 @@ + .text + .global __hexagon_mindf3 + .global __hexagon_maxdf3 + .type __hexagon_mindf3,@function + .type __hexagon_maxdf3,@function + .global __qdsp_mindf3 ; .set __qdsp_mindf3, __hexagon_mindf3 + .global __qdsp_maxdf3 ; .set __qdsp_maxdf3, __hexagon_maxdf3 + .p2align 5 +__hexagon_mindf3: + { + p0 = dfclass(r1:0,#0x10) + p1 = dfcmp.gt(r1:0,r3:2) + r5:4 = r1:0 + } + { + if (p0) r1:0 = r3:2 + if (p1) r1:0 = r3:2 + p2 = dfcmp.eq(r1:0,r3:2) + if (!p2.new) jumpr:t r31 + } + + { + r1:0 = or(r5:4,r3:2) + jumpr r31 + } +.size __hexagon_mindf3,.-__hexagon_mindf3 + .falign +__hexagon_maxdf3: + { + p0 = dfclass(r1:0,#0x10) + p1 = dfcmp.gt(r3:2,r1:0) + r5:4 = r1:0 + } + { + if (p0) r1:0 = r3:2 + if (p1) r1:0 = r3:2 + p2 = dfcmp.eq(r1:0,r3:2) + if (!p2.new) jumpr:t r31 + } + + { + r1:0 = and(r5:4,r3:2) + jumpr r31 + } +.size __hexagon_maxdf3,.-__hexagon_maxdf3 diff --git a/src/hexagon/dfmul.s b/src/hexagon/dfmul.s new file mode 100644 index 00000000..32fc674f --- /dev/null +++ b/src/hexagon/dfmul.s @@ -0,0 +1,309 @@ + .text + .global __hexagon_muldf3 + .type __hexagon_muldf3,@function + .global __qdsp_muldf3 ; .set __qdsp_muldf3, __hexagon_muldf3 + .global __hexagon_fast_muldf3 ; .set __hexagon_fast_muldf3, __hexagon_muldf3 + .global __hexagon_fast2_muldf3 ; .set __hexagon_fast2_muldf3, __hexagon_muldf3 + .p2align 5 +__hexagon_muldf3: + { + p0 = dfclass(r1:0,#2) + p0 = dfclass(r3:2,#2) + r13:12 = combine(##0x40000000,#0) + } + { + r13:12 = insert(r1:0,#52,#11 -1) + r5:4 = asl(r3:2,#11 -1) + r28 = #-1024 + r9:8 = #1 + } + { + r7:6 = mpyu(r4,r13) + r5:4 = insert(r9:8,#2,#62) + } + + + + + { + r15:14 = mpyu(r12,r4) + r7:6 += mpyu(r12,r5) + } 
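+ // The surrounding packets assemble the full mantissa product
+ // schoolbook-style from 32x32->64 partial products: r11:10 collects
+ // hi*hi plus the carried cross terms from r7:6, while the low bits of
+ // the product (r14 and, after the carry add, r6) survive only as a
+ // sticky contribution to rounding, folded in via "if (!p1) r10 = or(r10,r8)".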
+ { + r7:6 += lsr(r15:14,#32) + r11:10 = mpyu(r13,r5) + r5:4 = combine(##1024 +1024 -4,#0) + } + { + r11:10 += lsr(r7:6,#32) + if (!p0) jump .Lmul_abnormal + p1 = cmp.eq(r14,#0) + p1 = cmp.eq(r6,#0) + } + { + if (!p1) r10 = or(r10,r8) + r6 = extractu(r1,#11,#20) + r7 = extractu(r3,#11,#20) + } + { + r15:14 = neg(r11:10) + r6 += add(r28,r7) + r28 = xor(r1,r3) + } + { + if (!p2.new) r11:10 = r15:14 + p2 = cmp.gt(r28,#-1) + p0 = !cmp.gt(r6,r5) + p0 = cmp.gt(r6,r4) + if (!p0.new) jump:nt .Lmul_ovf_unf + } + { + r1:0 = convert_d2df(r11:10) + r6 = add(r6,#-1024 -58) + } + { + r1 += asl(r6,#20) + jumpr r31 + } + + .falign +.Lpossible_unf1: + { + p0 = cmp.eq(r0,#0) + p0 = bitsclr(r1,r4) + if (!p0.new) jumpr:t r31 + r5 = #0x7fff + } + { + p0 = bitsset(r13,r5) + r4 = USR + r5 = #0x030 + } + { + if (p0) r4 = or(r4,r5) + } + { + USR = r4 + } + { + p0 = dfcmp.eq(r1:0,r1:0) + jumpr r31 + } + .falign +.Lmul_ovf_unf: + { + r1:0 = convert_d2df(r11:10) + r13:12 = abs(r11:10) + r7 = add(r6,#-1024 -58) + } + { + r1 += asl(r7,#20) + r7 = extractu(r1,#11,#20) + r4 = ##0x7FEFFFFF + } + { + r7 += add(r6,##-1024 -58) + + r5 = #0 + } + { + p0 = cmp.gt(r7,##1024 +1024 -2) + if (p0.new) jump:nt .Lmul_ovf + } + { + p0 = cmp.gt(r7,#0) + if (p0.new) jump:nt .Lpossible_unf1 + r5 = sub(r6,r5) + r28 = #63 + } + { + r4 = #0 + r5 = sub(#5,r5) + } + { + p3 = cmp.gt(r11,#-1) + r5 = min(r5,r28) + r11:10 = r13:12 + } + { + r28 = USR + r15:14 = extractu(r11:10,r5:4) + } + { + r11:10 = asr(r11:10,r5) + r4 = #0x0030 + r1 = insert(r9,#11,#20) + } + { + p0 = cmp.gtu(r9:8,r15:14) + if (!p0.new) r10 = or(r10,r8) + r11 = setbit(r11,#20 +3) + } + { + r15:14 = neg(r11:10) + p1 = bitsclr(r10,#0x7) + if (!p1.new) r28 = or(r4,r28) + } + { + if (!p3) r11:10 = r15:14 + USR = r28 + } + { + r1:0 = convert_d2df(r11:10) + p0 = dfcmp.eq(r1:0,r1:0) + } + { + r1 = insert(r9,#11 -1,#20 +1) + jumpr r31 + } + .falign +.Lmul_ovf: + + { + r28 = USR + r13:12 = combine(##0x7fefffff,#-1) + r1:0 = r11:10 + } + { + r14 = extractu(r28,#2,#22) + r28 = or(r28,#0x28) + r5:4 = combine(##0x7ff00000,#0) + } + { + USR = r28 + r14 ^= lsr(r1,#31) + r28 = r14 + } + { + p0 = !cmp.eq(r28,#1) + p0 = !cmp.eq(r14,#2) + if (p0.new) r13:12 = r5:4 + p0 = dfcmp.eq(r1:0,r1:0) + } + { + r1:0 = insert(r13:12,#63,#0) + jumpr r31 + } + +.Lmul_abnormal: + { + r13:12 = extractu(r1:0,#63,#0) + r5:4 = extractu(r3:2,#63,#0) + } + { + p3 = cmp.gtu(r13:12,r5:4) + if (!p3.new) r1:0 = r3:2 + if (!p3.new) r3:2 = r1:0 + } + { + + p0 = dfclass(r1:0,#0x0f) + if (!p0.new) jump:nt .Linvalid_nan + if (!p3) r13:12 = r5:4 + if (!p3) r5:4 = r13:12 + } + { + + p1 = dfclass(r1:0,#0x08) + p1 = dfclass(r3:2,#0x0e) + } + { + + + p0 = dfclass(r1:0,#0x08) + p0 = dfclass(r3:2,#0x01) + } + { + if (p1) jump .Ltrue_inf + p2 = dfclass(r3:2,#0x01) + } + { + if (p0) jump .Linvalid_zeroinf + if (p2) jump .Ltrue_zero + r28 = ##0x7c000000 + } + + + + + + { + p0 = bitsclr(r1,r28) + if (p0.new) jump:nt .Lmul_tiny + } + { + r28 = cl0(r5:4) + } + { + r28 = add(r28,#-11) + } + { + r5:4 = asl(r5:4,r28) + } + { + r3:2 = insert(r5:4,#63,#0) + r1 -= asl(r28,#20) + } + jump __hexagon_muldf3 +.Lmul_tiny: + { + r28 = USR + r1:0 = xor(r1:0,r3:2) + } + { + r28 = or(r28,#0x30) + r1:0 = insert(r9:8,#63,#0) + r5 = extractu(r28,#2,#22) + } + { + USR = r28 + p0 = cmp.gt(r5,#1) + if (!p0.new) r0 = #0 + r5 ^= lsr(r1,#31) + } + { + p0 = cmp.eq(r5,#3) + if (!p0.new) r0 = #0 + jumpr r31 + } +.Linvalid_zeroinf: + { + r28 = USR + } + { + r1:0 = #-1 + r28 = or(r28,#2) + } + { + USR = r28 + } + { + p0 = dfcmp.uo(r1:0,r1:0) + jumpr r31 + } 
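+ // The NaN path below uses convert_df2sf only for its side effect: the
+ // conversion of a signaling NaN raises the invalid exception in USR,
+ // and the converted results (r28, r2) are discarded. The returned
+ // r1:0 = #-1 is all ones, which reads back as a quiet NaN.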
+.Linvalid_nan: + { + p0 = dfclass(r3:2,#0x0f) + r28 = convert_df2sf(r1:0) + if (p0.new) r3:2 = r1:0 + } + { + r2 = convert_df2sf(r3:2) + r1:0 = #-1 + jumpr r31 + } + .falign +.Ltrue_zero: + { + r1:0 = r3:2 + r3:2 = r1:0 + } +.Ltrue_inf: + { + r3 = extract(r3,#1,#31) + } + { + r1 ^= asl(r3,#31) + jumpr r31 + } +.size __hexagon_muldf3,.-__hexagon_muldf3 diff --git a/src/hexagon/dfsqrt.s b/src/hexagon/dfsqrt.s new file mode 100644 index 00000000..14f584a1 --- /dev/null +++ b/src/hexagon/dfsqrt.s @@ -0,0 +1,277 @@ + .text + .global __hexagon_sqrtdf2 + .type __hexagon_sqrtdf2,@function + .global __hexagon_sqrt + .type __hexagon_sqrt,@function + .global __qdsp_sqrtdf2 ; .set __qdsp_sqrtdf2, __hexagon_sqrtdf2; .type __qdsp_sqrtdf2,@function + .global __qdsp_sqrt ; .set __qdsp_sqrt, __hexagon_sqrt; .type __qdsp_sqrt,@function + .global __hexagon_fast_sqrtdf2 ; .set __hexagon_fast_sqrtdf2, __hexagon_sqrtdf2; .type __hexagon_fast_sqrtdf2,@function + .global __hexagon_fast_sqrt ; .set __hexagon_fast_sqrt, __hexagon_sqrt; .type __hexagon_fast_sqrt,@function + .global __hexagon_fast2_sqrtdf2 ; .set __hexagon_fast2_sqrtdf2, __hexagon_sqrtdf2; .type __hexagon_fast2_sqrtdf2,@function + .global __hexagon_fast2_sqrt ; .set __hexagon_fast2_sqrt, __hexagon_sqrt; .type __hexagon_fast2_sqrt,@function + .type sqrt,@function + .p2align 5 +__hexagon_sqrtdf2: +__hexagon_sqrt: + { + r15:14 = extractu(r1:0,#23 +1,#52 -23) + r28 = extractu(r1,#11,#52 -32) + r5:4 = combine(##0x3f000004,#1) + } + { + p2 = dfclass(r1:0,#0x02) + p2 = cmp.gt(r1,#-1) + if (!p2.new) jump:nt .Lsqrt_abnormal + r9 = or(r5,r14) + } + +.Ldenormal_restart: + { + r11:10 = r1:0 + r7,p0 = sfinvsqrta(r9) + r5 = and(r5,#-16) + r3:2 = #0 + } + { + r3 += sfmpy(r7,r9):lib + r2 += sfmpy(r7,r5):lib + r6 = r5 + + + r9 = and(r28,#1) + } + { + r6 -= sfmpy(r3,r2):lib + r11 = insert(r4,#11 +1,#52 -32) + p1 = cmp.gtu(r9,#0) + } + { + r3 += sfmpy(r3,r6):lib + r2 += sfmpy(r2,r6):lib + r6 = r5 + r9 = mux(p1,#8,#9) + } + { + r6 -= sfmpy(r3,r2):lib + r11:10 = asl(r11:10,r9) + r9 = mux(p1,#3,#2) + } + { + r2 += sfmpy(r2,r6):lib + + r15:14 = asl(r11:10,r9) + } + { + r2 = and(r2,##0x007fffff) + } + { + r2 = add(r2,##0x00800000 - 3) + r9 = mux(p1,#7,#8) + } + { + r8 = asl(r2,r9) + r9 = mux(p1,#15-(1+1),#15-(1+0)) + } + { + r13:12 = mpyu(r8,r15) + } + { + r1:0 = asl(r11:10,#15) + r15:14 = mpyu(r13,r13) + p1 = cmp.eq(r0,r0) + } + { + r1:0 -= asl(r15:14,#15) + r15:14 = mpyu(r13,r12) + p2 = cmp.eq(r0,r0) + } + { + r1:0 -= lsr(r15:14,#16) + p3 = cmp.eq(r0,r0) + } + { + r1:0 = mpyu(r1,r8) + } + { + r13:12 += lsr(r1:0,r9) + r9 = add(r9,#16) + r1:0 = asl(r11:10,#31) + } + + { + r15:14 = mpyu(r13,r13) + r1:0 -= mpyu(r13,r12) + } + { + r1:0 -= asl(r15:14,#31) + r15:14 = mpyu(r12,r12) + } + { + r1:0 -= lsr(r15:14,#33) + } + { + r1:0 = mpyu(r1,r8) + } + { + r13:12 += lsr(r1:0,r9) + r9 = add(r9,#16) + r1:0 = asl(r11:10,#47) + } + + { + r15:14 = mpyu(r13,r13) + } + { + r1:0 -= asl(r15:14,#47) + r15:14 = mpyu(r13,r12) + } + { + r1:0 -= asl(r15:14,#16) + r15:14 = mpyu(r12,r12) + } + { + r1:0 -= lsr(r15:14,#17) + } + { + r1:0 = mpyu(r1,r8) + } + { + r13:12 += lsr(r1:0,r9) + } + { + r3:2 = mpyu(r13,r12) + r5:4 = mpyu(r12,r12) + r15:14 = #0 + r1:0 = #0 + } + { + r3:2 += lsr(r5:4,#33) + r5:4 += asl(r3:2,#33) + p1 = cmp.eq(r0,r0) + } + { + r7:6 = mpyu(r13,r13) + r1:0 = sub(r1:0,r5:4,p1):carry + r9:8 = #1 + } + { + r7:6 += lsr(r3:2,#31) + r9:8 += asl(r13:12,#1) + } + + + + + + { + r15:14 = sub(r11:10,r7:6,p1):carry + r5:4 = sub(r1:0,r9:8,p2):carry + + + + + r7:6 = #1 + r11:10 = #0 + } + { + 
r3:2 = sub(r15:14,r11:10,p2):carry + r7:6 = add(r13:12,r7:6) + r28 = add(r28,#-0x3ff) + } + { + + if (p2) r13:12 = r7:6 + if (p2) r1:0 = r5:4 + if (p2) r15:14 = r3:2 + } + { + r5:4 = sub(r1:0,r9:8,p3):carry + r7:6 = #1 + r28 = asr(r28,#1) + } + { + r3:2 = sub(r15:14,r11:10,p3):carry + r7:6 = add(r13:12,r7:6) + } + { + if (p3) r13:12 = r7:6 + if (p3) r1:0 = r5:4 + + + + + + r2 = #1 + } + { + p0 = cmp.eq(r1:0,r11:10) + if (!p0.new) r12 = or(r12,r2) + r3 = cl0(r13:12) + r28 = add(r28,#-63) + } + + + + { + r1:0 = convert_ud2df(r13:12) + r28 = add(r28,r3) + } + { + r1 += asl(r28,#52 -32) + jumpr r31 + } +.Lsqrt_abnormal: + { + p0 = dfclass(r1:0,#0x01) + if (p0.new) jumpr:t r31 + } + { + p0 = dfclass(r1:0,#0x10) + if (p0.new) jump:nt .Lsqrt_nan + } + { + p0 = cmp.gt(r1,#-1) + if (!p0.new) jump:nt .Lsqrt_invalid_neg + if (!p0.new) r28 = ##0x7F800001 + } + { + p0 = dfclass(r1:0,#0x08) + if (p0.new) jumpr:nt r31 + } + + + { + r1:0 = extractu(r1:0,#52,#0) + } + { + r28 = add(clb(r1:0),#-11) + } + { + r1:0 = asl(r1:0,r28) + r28 = sub(#1,r28) + } + { + r1 = insert(r28,#1,#52 -32) + } + { + r3:2 = extractu(r1:0,#23 +1,#52 -23) + r5 = ##0x3f000004 + } + { + r9 = or(r5,r2) + r5 = and(r5,#-16) + jump .Ldenormal_restart + } +.Lsqrt_nan: + { + r28 = convert_df2sf(r1:0) + r1:0 = #-1 + jumpr r31 + } +.Lsqrt_invalid_neg: + { + r1:0 = convert_sf2df(r28) + jumpr r31 + } +.size __hexagon_sqrt,.-__hexagon_sqrt +.size __hexagon_sqrtdf2,.-__hexagon_sqrtdf2 diff --git a/src/hexagon/divdi3.s b/src/hexagon/divdi3.s new file mode 100644 index 00000000..0fee6e70 --- /dev/null +++ b/src/hexagon/divdi3.s @@ -0,0 +1,64 @@ + +FUNCTION_BEGIN __hexagon_divdi3 + { + p2 = tstbit(r1,#31) + p3 = tstbit(r3,#31) + } + { + r1:0 = abs(r1:0) + r3:2 = abs(r3:2) + } + { + r6 = cl0(r1:0) + r7 = cl0(r3:2) + r5:4 = r3:2 + r3:2 = r1:0 + } + { + p3 = xor(p2,p3) + r10 = sub(r7,r6) + r1:0 = #0 + r15:14 = #1 + } + { + r11 = add(r10,#1) + r13:12 = lsl(r5:4,r10) + r15:14 = lsl(r15:14,r10) + } + { + p0 = cmp.gtu(r5:4,r3:2) + loop0(1f,r11) + } + { + if (p0) jump .hexagon_divdi3_return + } + .falign +1: + { + p0 = cmp.gtu(r13:12,r3:2) + } + { + r7:6 = sub(r3:2, r13:12) + r9:8 = add(r1:0, r15:14) + } + { + r1:0 = vmux(p0, r1:0, r9:8) + r3:2 = vmux(p0, r3:2, r7:6) + } + { + r15:14 = lsr(r15:14, #1) + r13:12 = lsr(r13:12, #1) + }:endloop0 + +.hexagon_divdi3_return: + { + r3:2 = neg(r1:0) + } + { + r1:0 = vmux(p3,r3:2,r1:0) + jumpr r31 + } +FUNCTION_END __hexagon_divdi3 + + .globl __qdsp_divdi3 + .set __qdsp_divdi3, __hexagon_divdi3 diff --git a/src/hexagon/divsi3.s b/src/hexagon/divsi3.s new file mode 100644 index 00000000..fc957a43 --- /dev/null +++ b/src/hexagon/divsi3.s @@ -0,0 +1,53 @@ + +FUNCTION_BEGIN __hexagon_divsi3 + { + p0 = cmp.ge(r0,#0) + p1 = cmp.ge(r1,#0) + r1 = abs(r0) + r2 = abs(r1) + } + { + r3 = cl0(r1) + r4 = cl0(r2) + r5 = sub(r1,r2) + p2 = cmp.gtu(r2,r1) + } + { + r0 = #0 + p1 = xor(p0,p1) + p0 = cmp.gtu(r2,r5) + if (p2) jumpr r31 + } + + { + r0 = mux(p1,#-1,#1) + if (p0) jumpr r31 + r4 = sub(r4,r3) + r3 = #1 + } + { + r0 = #0 + r3:2 = vlslw(r3:2,r4) + loop0(1f,r4) + } + .falign +1: + { + p0 = cmp.gtu(r2,r1) + if (!p0.new) r1 = sub(r1,r2) + if (!p0.new) r0 = add(r0,r3) + r3:2 = vlsrw(r3:2,#1) + }:endloop0 + { + p0 = cmp.gtu(r2,r1) + if (!p0.new) r0 = add(r0,r3) + if (!p1) jumpr r31 + } + { + r0 = neg(r0) + jumpr r31 + } +FUNCTION_END __hexagon_divsi3 + + .globl __qdsp_divsi3 + .set __qdsp_divsi3, __hexagon_divsi3 diff --git a/src/hexagon/fastmath2_dlib_asm.s b/src/hexagon/fastmath2_dlib_asm.s new file mode 100644 index 
00000000..e77b7db0 --- /dev/null +++ b/src/hexagon/fastmath2_dlib_asm.s @@ -0,0 +1,266 @@ + .text + .global __hexagon_fast2_dadd_asm + .type __hexagon_fast2_dadd_asm, @function +__hexagon_fast2_dadd_asm: + .falign + { + R7:6 = VABSDIFFH(R1:0, R3:2) + R9 = #62 + R4 = SXTH(R0) + R5 = SXTH(R2) + } { + R6 = SXTH(R6) + P0 = CMP.GT(R4, R5); + if ( P0.new) R8 = add(R4, #1) + if (!P0.new) R8 = add(R5, #1) + } { + if ( P0) R4 = #1 + if (!P0) R5 = #1 + R0.L = #0 + R6 = MIN(R6, R9) + } { + if (!P0) R4 = add(R6, #1) + if ( P0) R5 = add(R6, #1) + R2.L = #0 + R11:10 = #0 + } { + R1:0 = ASR(R1:0, R4) + R3:2 = ASR(R3:2, R5) + } { + R1:0 = add(R1:0, R3:2) + R10.L = #0x8001 + } { + R4 = clb(R1:0) + R9 = #58 + } { + R4 = add(R4, #-1) + p0 = cmp.gt(R4, R9) + } { + R1:0 = ASL(R1:0, R4) + R8 = SUB(R8, R4) + if(p0) jump .Ldenorma + } { + R0 = insert(R8, #16, #0) + jumpr r31 + } +.Ldenorma: + { + R1:0 = R11:10 + jumpr r31 + } + .text + .global __hexagon_fast2_dsub_asm + .type __hexagon_fast2_dsub_asm, @function +__hexagon_fast2_dsub_asm: + .falign + { + R7:6 = VABSDIFFH(R1:0, R3:2) + R9 = #62 + R4 = SXTH(R0) + R5 = SXTH(R2) + } { + R6 = SXTH(R6) + P0 = CMP.GT(R4, R5); + if ( P0.new) R8 = add(R4, #1) + if (!P0.new) R8 = add(R5, #1) + } { + if ( P0) R4 = #1 + if (!P0) R5 = #1 + R0.L = #0 + R6 = MIN(R6, R9) + } { + if (!P0) R4 = add(R6, #1) + if ( P0) R5 = add(R6, #1) + R2.L = #0 + R11:10 = #0 + } { + R1:0 = ASR(R1:0, R4) + R3:2 = ASR(R3:2, R5) + } { + R1:0 = sub(R1:0, R3:2) + R10.L = #0x8001 + } { + R4 = clb(R1:0) + R9 = #58 + } { + R4 = add(R4, #-1) + p0 = cmp.gt(R4, R9) + } { + R1:0 = ASL(R1:0, R4) + R8 = SUB(R8, R4) + if(p0) jump .Ldenorm + } { + R0 = insert(R8, #16, #0) + jumpr r31 + } +.Ldenorm: + { + R1:0 = R11:10 + jumpr r31 + } + .text + .global __hexagon_fast2_dmpy_asm + .type __hexagon_fast2_dmpy_asm, @function +__hexagon_fast2_dmpy_asm: + .falign + { + R13= lsr(R2, #16) + R5 = sxth(R2) + R4 = sxth(R0) + R12= lsr(R0, #16) + } + { + R11:10 = mpy(R1, R3) + R7:6 = mpy(R1, R13) + R0.L = #0x0 + R15:14 = #0 + } + { + R11:10 = add(R11:10, R11:10) + R7:6 += mpy(R3, R12) + R2.L = #0x0 + R15.H = #0x8000 + } + { + R7:6 = asr(R7:6, #15) + R12.L = #0x8001 + p1 = cmp.eq(R1:0, R3:2) + } + { + R7:6 = add(R7:6, R11:10) + R8 = add(R4, R5) + p2 = cmp.eq(R1:0, R15:14) + } + { + R9 = clb(R7:6) + R3:2 = abs(R7:6) + R11 = #58 + } + { + p1 = and(p1, p2) + R8 = sub(R8, R9) + R9 = add(R9, #-1) + p0 = cmp.gt(R9, R11) + } + { + R8 = add(R8, #1) + R1:0 = asl(R7:6, R9) + if(p1) jump .Lsat + } + { + R0 = insert(R8,#16, #0) + if(!p0) jumpr r31 + } + { + R0 = insert(R12,#16, #0) + jumpr r31 + } +.Lsat: + { + R1:0 = #-1 + } + { + R1:0 = lsr(R1:0, #1) + } + { + R0 = insert(R8,#16, #0) + jumpr r31 + } + .text + .global __hexagon_fast2_qd2f_asm + .type __hexagon_fast2_qd2f_asm, @function +__hexagon_fast2_qd2f_asm: + .falign + { + R3 = abs(R1):sat + R4 = sxth(R0) + R5 = #0x40 + R6.L = #0xffc0 + } + { + R0 = extractu(R3, #8, #0) + p2 = cmp.gt(R4, #126) + p3 = cmp.ge(R4, #-126) + R6.H = #0x7fff + } + { + p1 = cmp.eq(R0,#0x40) + if(p1.new) R5 = #0 + R4 = add(R4, #126) + if(!p3) jump .Lmin + } + { + p0 = bitsset(R3, R6) + R0.L = #0x0000 + R2 = add(R3, R5) + R7 = lsr(R6, #8) + } + { + if(p0) R4 = add(R4, #1) + if(p0) R3 = #0 + R2 = lsr(R2, #7) + R0.H = #0x8000 + } + { + R0 = and(R0, R1) + R6 &= asl(R4, #23) + if(!p0) R3 = and(R2, R7) + if(p2) jump .Lmax + } + { + R0 += add(R6, R3) + jumpr r31 + } +.Lmax: + { + R0.L = #0xffff; + } + { + R0.H = #0x7f7f; + jumpr r31 + } +.Lmin: + { + R0 = #0x0 + jumpr r31 + } + .text + .global 
__hexagon_fast2_f2qd_asm + .type __hexagon_fast2_f2qd_asm, @function +__hexagon_fast2_f2qd_asm: + + + + + + + + .falign + { + R1 = asl(R0, #7) + p0 = tstbit(R0, #31) + R5:4 = #0 + R3 = add(R0,R0) + } + { + R1 = setbit(R1, #30) + R0= extractu(R0,#8,#23) + R4.L = #0x8001 + p1 = cmp.eq(R3, #0) + } + { + R1= extractu(R1, #31, #0) + R0= add(R0, #-126) + R2 = #0 + if(p1) jump .Lminqd + } + { + R0 = zxth(R0) + if(p0) R1= sub(R2, R1) + jumpr r31 + } +.Lminqd: + { + R1:0 = R5:4 + jumpr r31 + } diff --git a/src/hexagon/fastmath2_ldlib_asm.s b/src/hexagon/fastmath2_ldlib_asm.s new file mode 100644 index 00000000..3251057d --- /dev/null +++ b/src/hexagon/fastmath2_ldlib_asm.s @@ -0,0 +1,187 @@ + .text + .global __hexagon_fast2ldadd_asm + .type __hexagon_fast2ldadd_asm, @function +__hexagon_fast2ldadd_asm: + .falign + { + R4 = memw(r29+#8) + R5 = memw(r29+#24) + r7 = r0 + } + { + R6 = sub(R4, R5):sat + P0 = CMP.GT(R4, R5); + if ( P0.new) R8 = add(R4, #1) + if (!P0.new) R8 = add(R5, #1) + } { + R6 = abs(R6):sat + if ( P0) R4 = #1 + if (!P0) R5 = #1 + R9 = #62 + } { + R6 = MIN(R6, R9) + R1:0 = memd(r29+#0) + R3:2 = memd(r29+#16) + } { + if (!P0) R4 = add(R6, #1) + if ( P0) R5 = add(R6, #1) + } { + R1:0 = ASR(R1:0, R4) + R3:2 = ASR(R3:2, R5) + } { + R1:0 = add(R1:0, R3:2) + R3:2 = #0 + } { + R4 = clb(R1:0) + R9.L =#0x0001 + } { + R8 -= add(R4, #-1) + R4 = add(R4, #-1) + p0 = cmp.gt(R4, #58) + R9.H =#0x8000 + } { + if(!p0)memw(r7+#8) = R8 + R1:0 = ASL(R1:0, R4) + if(p0) jump .Ldenorma1 + } { + memd(r7+#0) = R1:0 + jumpr r31 + } +.Ldenorma1: + memd(r7+#0) = R3:2 + { + memw(r7+#8) = R9 + jumpr r31 + } + .text + .global __hexagon_fast2ldsub_asm + .type __hexagon_fast2ldsub_asm, @function +__hexagon_fast2ldsub_asm: + .falign + { + R4 = memw(r29+#8) + R5 = memw(r29+#24) + r7 = r0 + } + { + R6 = sub(R4, R5):sat + P0 = CMP.GT(R4, R5); + if ( P0.new) R8 = add(R4, #1) + if (!P0.new) R8 = add(R5, #1) + } { + R6 = abs(R6):sat + if ( P0) R4 = #1 + if (!P0) R5 = #1 + R9 = #62 + } { + R6 = min(R6, R9) + R1:0 = memd(r29+#0) + R3:2 = memd(r29+#16) + } { + if (!P0) R4 = add(R6, #1) + if ( P0) R5 = add(R6, #1) + } { + R1:0 = ASR(R1:0, R4) + R3:2 = ASR(R3:2, R5) + } { + R1:0 = sub(R1:0, R3:2) + R3:2 = #0 + } { + R4 = clb(R1:0) + R9.L =#0x0001 + } { + R8 -= add(R4, #-1) + R4 = add(R4, #-1) + p0 = cmp.gt(R4, #58) + R9.H =#0x8000 + } { + if(!p0)memw(r7+#8) = R8 + R1:0 = asl(R1:0, R4) + if(p0) jump .Ldenorma_s + } { + memd(r7+#0) = R1:0 + jumpr r31 + } +.Ldenorma_s: + memd(r7+#0) = R3:2 + { + memw(r7+#8) = R9 + jumpr r31 + } + .text + .global __hexagon_fast2ldmpy_asm + .type __hexagon_fast2ldmpy_asm, @function +__hexagon_fast2ldmpy_asm: + .falign + { + R15:14 = memd(r29+#0) + R3:2 = memd(r29+#16) + R13:12 = #0 + } + { + R8= extractu(R2, #31, #1) + R9= extractu(R14, #31, #1) + R13.H = #0x8000 + } + { + R11:10 = mpy(R15, R3) + R7:6 = mpy(R15, R8) + R4 = memw(r29+#8) + R5 = memw(r29+#24) + } + { + R11:10 = add(R11:10, R11:10) + R7:6 += mpy(R3, R9) + } + { + R7:6 = asr(R7:6, #30) + R8.L = #0x0001 + p1 = cmp.eq(R15:14, R3:2) + } + { + R7:6 = add(R7:6, R11:10) + R4= add(R4, R5) + p2 = cmp.eq(R3:2, R13:12) + } + { + R9 = clb(R7:6) + R8.H = #0x8000 + p1 = and(p1, p2) + } + { + R4-= add(R9, #-1) + R9 = add(R9, #-1) + if(p1) jump .Lsat1 + } + { + R7:6 = asl(R7:6, R9) + memw(R0+#8) = R4 + p0 = cmp.gt(R9, #58) + if(p0.new) jump:NT .Ldenorm1 + } + { + memd(R0+#0) = R7:6 + jumpr r31 + } +.Lsat1: + { + R13:12 = #0 + R4+= add(R9, #1) + } + { + R13.H = #0x4000 + memw(R0+#8) = R4 + } + { + memd(R0+#0) = R13:12 + jumpr r31 + } +.Ldenorm1: + { + 
memw(R0+#8) = R8 + R15:14 = #0 + } + { + memd(R0+#0) = R15:14 + jumpr r31 + } diff --git a/src/hexagon/func_macro.s b/src/hexagon/func_macro.s new file mode 100644 index 00000000..9a1e11ae --- /dev/null +++ b/src/hexagon/func_macro.s @@ -0,0 +1,12 @@ + .macro FUNCTION_BEGIN name + .text + .p2align 5 + .globl \name + .type \name, @function +\name: + .endm + + .macro FUNCTION_END name + .size \name, . - \name + .endm + diff --git a/src/hexagon/memcpy_forward_vp4cp4n2.s b/src/hexagon/memcpy_forward_vp4cp4n2.s new file mode 100644 index 00000000..89f69010 --- /dev/null +++ b/src/hexagon/memcpy_forward_vp4cp4n2.s @@ -0,0 +1,91 @@ + .text + + + + + + + .globl hexagon_memcpy_forward_vp4cp4n2 + .balign 32 + .type hexagon_memcpy_forward_vp4cp4n2,@function +hexagon_memcpy_forward_vp4cp4n2: + + + + + { + r3 = sub(##4096, r1) + r5 = lsr(r2, #3) + } + { + + + r3 = extractu(r3, #10, #2) + r4 = extractu(r3, #7, #5) + } + { + r3 = minu(r2, r3) + r4 = minu(r5, r4) + } + { + r4 = or(r4, ##2105344) + p0 = cmp.eq(r3, #0) + if (p0.new) jump:nt .Lskipprolog + } + l2fetch(r1, r4) + { + loop0(.Lprolog, r3) + r2 = sub(r2, r3) + } + .falign +.Lprolog: + { + r4 = memw(r1++#4) + memw(r0++#4) = r4.new + } :endloop0 +.Lskipprolog: + { + + r3 = lsr(r2, #10) + if (cmp.eq(r3.new, #0)) jump:nt .Lskipmain + } + { + loop1(.Lout, r3) + r2 = extractu(r2, #10, #0) + r3 = ##2105472 + } + + .falign +.Lout: + + l2fetch(r1, r3) + loop0(.Lpage, #512) + .falign +.Lpage: + r5:4 = memd(r1++#8) + { + memw(r0++#8) = r4 + memw(r0+#4) = r5 + } :endloop0:endloop1 +.Lskipmain: + { + r3 = ##2105344 + r4 = lsr(r2, #3) + p0 = cmp.eq(r2, #0) + if (p0.new) jumpr:nt r31 + } + { + r3 = or(r3, r4) + loop0(.Lepilog, r2) + } + l2fetch(r1, r3) + .falign +.Lepilog: + { + r4 = memw(r1++#4) + memw(r0++#4) = r4.new + } :endloop0 + + jumpr r31 + +.size hexagon_memcpy_forward_vp4cp4n2, . 
- hexagon_memcpy_forward_vp4cp4n2 diff --git a/src/hexagon/memcpy_likely_aligned.s b/src/hexagon/memcpy_likely_aligned.s new file mode 100644 index 00000000..7e9b62f6 --- /dev/null +++ b/src/hexagon/memcpy_likely_aligned.s @@ -0,0 +1,42 @@ + +FUNCTION_BEGIN __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes + { + p0 = bitsclr(r1,#7) + p0 = bitsclr(r0,#7) + if (p0.new) r5:4 = memd(r1) + r3 = #-3 + } + { + if (!p0) jump .Lmemcpy_call + if (p0) memd(r0++#8) = r5:4 + if (p0) r5:4 = memd(r1+#8) + r3 += lsr(r2,#3) + } + { + memd(r0++#8) = r5:4 + r5:4 = memd(r1+#16) + r1 = add(r1,#24) + loop0(1f,r3) + } + .falign +1: + { + memd(r0++#8) = r5:4 + r5:4 = memd(r1++#8) + }:endloop0 + { + memd(r0) = r5:4 + r0 -= add(r2,#-8) + jumpr r31 + } +FUNCTION_END __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes + +.Lmemcpy_call: + + jump memcpy@PLT + + + + + .globl __qdsp_memcpy_likely_aligned_min32bytes_mult8bytes + .set __qdsp_memcpy_likely_aligned_min32bytes_mult8bytes, __hexagon_memcpy_likely_aligned_min32bytes_mult8bytes diff --git a/src/hexagon/moddi3.s b/src/hexagon/moddi3.s new file mode 100644 index 00000000..53ea6d52 --- /dev/null +++ b/src/hexagon/moddi3.s @@ -0,0 +1,63 @@ + + +FUNCTION_BEGIN __hexagon_moddi3 + { + p3 = tstbit(r1,#31) + } + { + r1:0 = abs(r1:0) + r3:2 = abs(r3:2) + } + { + r6 = cl0(r1:0) + r7 = cl0(r3:2) + r5:4 = r3:2 + r3:2 = r1:0 + } + { + r10 = sub(r7,r6) + r1:0 = #0 + r15:14 = #1 + } + { + r11 = add(r10,#1) + r13:12 = lsl(r5:4,r10) + r15:14 = lsl(r15:14,r10) + } + { + p0 = cmp.gtu(r5:4,r3:2) + loop0(1f,r11) + } + { + if (p0) jump .hexagon_moddi3_return + } + .falign +1: + { + p0 = cmp.gtu(r13:12,r3:2) + } + { + r7:6 = sub(r3:2, r13:12) + r9:8 = add(r1:0, r15:14) + } + { + r1:0 = vmux(p0, r1:0, r9:8) + r3:2 = vmux(p0, r3:2, r7:6) + } + { + r15:14 = lsr(r15:14, #1) + r13:12 = lsr(r13:12, #1) + }:endloop0 + +.hexagon_moddi3_return: + { + r1:0 = neg(r3:2) + } + { + r1:0 = vmux(p3,r1:0,r3:2) + jumpr r31 + } +FUNCTION_END __hexagon_moddi3 + + .globl __qdsp_moddi3 + .set __qdsp_moddi3, __hexagon_moddi3 diff --git a/src/hexagon/modsi3.s b/src/hexagon/modsi3.s new file mode 100644 index 00000000..c4ae7e59 --- /dev/null +++ b/src/hexagon/modsi3.s @@ -0,0 +1,44 @@ + + +FUNCTION_BEGIN __hexagon_modsi3 + { + p2 = cmp.ge(r0,#0) + r2 = abs(r0) + r1 = abs(r1) + } + { + r3 = cl0(r2) + r4 = cl0(r1) + p0 = cmp.gtu(r1,r2) + } + { + r3 = sub(r4,r3) + if (p0) jumpr r31 + } + { + p1 = cmp.eq(r3,#0) + loop0(1f,r3) + r0 = r2 + r2 = lsl(r1,r3) + } + .falign +1: + { + p0 = cmp.gtu(r2,r0) + if (!p0.new) r0 = sub(r0,r2) + r2 = lsr(r2,#1) + if (p1) r1 = #0 + }:endloop0 + { + p0 = cmp.gtu(r2,r0) + if (!p0.new) r0 = sub(r0,r1) + if (p2) jumpr r31 + } + { + r0 = neg(r0) + jumpr r31 + } +FUNCTION_END __hexagon_modsi3 + + .globl __qdsp_modsi3 + .set __qdsp_modsi3, __hexagon_modsi3 diff --git a/src/hexagon/sfdiv_opt.s b/src/hexagon/sfdiv_opt.s new file mode 100644 index 00000000..26c91f15 --- /dev/null +++ b/src/hexagon/sfdiv_opt.s @@ -0,0 +1,42 @@ + +FUNCTION_BEGIN __hexagon_divsf3 + { + r2,p0 = sfrecipa(r0,r1) + r4 = sffixupd(r0,r1) + r3 = ##0x3f800000 + } + { + r5 = sffixupn(r0,r1) + r3 -= sfmpy(r4,r2):lib + r6 = ##0x80000000 + r7 = r3 + } + { + r2 += sfmpy(r3,r2):lib + r3 = r7 + r6 = r5 + r0 = and(r6,r5) + } + { + r3 -= sfmpy(r4,r2):lib + r0 += sfmpy(r5,r2):lib + } + { + r2 += sfmpy(r3,r2):lib + r6 -= sfmpy(r0,r4):lib + } + { + r0 += sfmpy(r6,r2):lib + } + { + r5 -= sfmpy(r0,r4):lib + } + { + r0 += sfmpy(r5,r2,p0):scale + jumpr r31 + } +FUNCTION_END __hexagon_divsf3 + +.global __qdsp_divsf3 ; .set 
__qdsp_divsf3, __hexagon_divsf3 +.global __hexagon_fast_divsf3 ; .set __hexagon_fast_divsf3, __hexagon_divsf3 +.global __hexagon_fast2_divsf3 ; .set __hexagon_fast2_divsf3, __hexagon_divsf3 diff --git a/src/hexagon/sfsqrt_opt.s b/src/hexagon/sfsqrt_opt.s new file mode 100644 index 00000000..c90af179 --- /dev/null +++ b/src/hexagon/sfsqrt_opt.s @@ -0,0 +1,49 @@ +FUNCTION_BEGIN __hexagon_sqrtf + { + r3,p0 = sfinvsqrta(r0) + r5 = sffixupr(r0) + r4 = ##0x3f000000 + r1:0 = combine(#0,#0) + } + { + r0 += sfmpy(r3,r5):lib + r1 += sfmpy(r3,r4):lib + r2 = r4 + r3 = r5 + } + { + r2 -= sfmpy(r0,r1):lib + p1 = sfclass(r5,#1) + + } + { + r0 += sfmpy(r0,r2):lib + r1 += sfmpy(r1,r2):lib + r2 = r4 + r3 = r5 + } + { + r2 -= sfmpy(r0,r1):lib + r3 -= sfmpy(r0,r0):lib + } + { + r0 += sfmpy(r1,r3):lib + r1 += sfmpy(r1,r2):lib + r2 = r4 + r3 = r5 + } + { + + r3 -= sfmpy(r0,r0):lib + if (p1) r0 = or(r0,r5) + } + { + r0 += sfmpy(r1,r3,p0):scale + jumpr r31 + } + +FUNCTION_END __hexagon_sqrtf + +.global __qdsp_sqrtf ; .set __qdsp_sqrtf, __hexagon_sqrtf +.global __hexagon_fast_sqrtf ; .set __hexagon_fast_sqrtf, __hexagon_sqrtf +.global __hexagon_fast2_sqrtf ; .set __hexagon_fast2_sqrtf, __hexagon_sqrtf diff --git a/src/hexagon/udivdi3.s b/src/hexagon/udivdi3.s new file mode 100644 index 00000000..f0fffc23 --- /dev/null +++ b/src/hexagon/udivdi3.s @@ -0,0 +1,50 @@ + + +FUNCTION_BEGIN __hexagon_udivdi3 + { + r6 = cl0(r1:0) + r7 = cl0(r3:2) + r5:4 = r3:2 + r3:2 = r1:0 + } + { + r10 = sub(r7,r6) + r1:0 = #0 + r15:14 = #1 + } + { + r11 = add(r10,#1) + r13:12 = lsl(r5:4,r10) + r15:14 = lsl(r15:14,r10) + } + { + p0 = cmp.gtu(r5:4,r3:2) + loop0(1f,r11) + } + { + if (p0) jumpr r31 + } + .falign +1: + { + p0 = cmp.gtu(r13:12,r3:2) + } + { + r7:6 = sub(r3:2, r13:12) + r9:8 = add(r1:0, r15:14) + } + { + r1:0 = vmux(p0, r1:0, r9:8) + r3:2 = vmux(p0, r3:2, r7:6) + } + { + r15:14 = lsr(r15:14, #1) + r13:12 = lsr(r13:12, #1) + }:endloop0 + { + jumpr r31 + } +FUNCTION_END __hexagon_udivdi3 + + .globl __qdsp_udivdi3 + .set __qdsp_udivdi3, __hexagon_udivdi3 diff --git a/src/hexagon/udivmoddi4.s b/src/hexagon/udivmoddi4.s new file mode 100644 index 00000000..cbfb3987 --- /dev/null +++ b/src/hexagon/udivmoddi4.s @@ -0,0 +1,50 @@ + + +FUNCTION_BEGIN __hexagon_udivmoddi4 + { + r6 = cl0(r1:0) + r7 = cl0(r3:2) + r5:4 = r3:2 + r3:2 = r1:0 + } + { + r10 = sub(r7,r6) + r1:0 = #0 + r15:14 = #1 + } + { + r11 = add(r10,#1) + r13:12 = lsl(r5:4,r10) + r15:14 = lsl(r15:14,r10) + } + { + p0 = cmp.gtu(r5:4,r3:2) + loop0(1f,r11) + } + { + if (p0) jumpr r31 + } + .falign +1: + { + p0 = cmp.gtu(r13:12,r3:2) + } + { + r7:6 = sub(r3:2, r13:12) + r9:8 = add(r1:0, r15:14) + } + { + r1:0 = vmux(p0, r1:0, r9:8) + r3:2 = vmux(p0, r3:2, r7:6) + } + { + r15:14 = lsr(r15:14, #1) + r13:12 = lsr(r13:12, #1) + }:endloop0 + { + jumpr r31 + } +FUNCTION_END __hexagon_udivmoddi4 + + .globl __qdsp_udivmoddi4 + .set __qdsp_udivmoddi4, __hexagon_udivmoddi4 diff --git a/src/hexagon/udivmodsi4.s b/src/hexagon/udivmodsi4.s new file mode 100644 index 00000000..83489c51 --- /dev/null +++ b/src/hexagon/udivmodsi4.s @@ -0,0 +1,39 @@ + + +FUNCTION_BEGIN __hexagon_udivmodsi4 + { + r2 = cl0(r0) + r3 = cl0(r1) + r5:4 = combine(#1,#0) + p0 = cmp.gtu(r1,r0) + } + { + r6 = sub(r3,r2) + r4 = r1 + r1:0 = combine(r0,r4) + if (p0) jumpr r31 + } + { + r3:2 = vlslw(r5:4,r6) + loop0(1f,r6) + p0 = cmp.eq(r6,#0) + if (p0.new) r4 = #0 + } + .falign +1: + { + p0 = cmp.gtu(r2,r1) + if (!p0.new) r1 = sub(r1,r2) + if (!p0.new) r0 = add(r0,r3) + r3:2 = vlsrw(r3:2,#1) + }:endloop0 + { + p0 = 
cmp.gtu(r2,r1) + if (!p0.new) r1 = sub(r1,r4) + if (!p0.new) r0 = add(r0,r3) + jumpr r31 + } +FUNCTION_END __hexagon_udivmodsi4 + + .globl __qdsp_udivmodsi4 + .set __qdsp_udivmodsi4, __hexagon_udivmodsi4 diff --git a/src/hexagon/udivsi3.s b/src/hexagon/udivsi3.s new file mode 100644 index 00000000..e0b94aa9 --- /dev/null +++ b/src/hexagon/udivsi3.s @@ -0,0 +1,36 @@ + + +FUNCTION_BEGIN __hexagon_udivsi3 + { + r2 = cl0(r0) + r3 = cl0(r1) + r5:4 = combine(#1,#0) + p0 = cmp.gtu(r1,r0) + } + { + r6 = sub(r3,r2) + r4 = r1 + r1:0 = combine(r0,r4) + if (p0) jumpr r31 + } + { + r3:2 = vlslw(r5:4,r6) + loop0(1f,r6) + } + .falign +1: + { + p0 = cmp.gtu(r2,r1) + if (!p0.new) r1 = sub(r1,r2) + if (!p0.new) r0 = add(r0,r3) + r3:2 = vlsrw(r3:2,#1) + }:endloop0 + { + p0 = cmp.gtu(r2,r1) + if (!p0.new) r0 = add(r0,r3) + jumpr r31 + } +FUNCTION_END __hexagon_udivsi3 + + .globl __qdsp_udivsi3 + .set __qdsp_udivsi3, __hexagon_udivsi3 diff --git a/src/hexagon/umoddi3.s b/src/hexagon/umoddi3.s new file mode 100644 index 00000000..c76011c3 --- /dev/null +++ b/src/hexagon/umoddi3.s @@ -0,0 +1,53 @@ + + +FUNCTION_BEGIN __hexagon_umoddi3 + { + r6 = cl0(r1:0) + r7 = cl0(r3:2) + r5:4 = r3:2 + r3:2 = r1:0 + } + { + r10 = sub(r7,r6) + r1:0 = #0 + r15:14 = #1 + } + { + r11 = add(r10,#1) + r13:12 = lsl(r5:4,r10) + r15:14 = lsl(r15:14,r10) + } + { + p0 = cmp.gtu(r5:4,r3:2) + loop0(1f,r11) + } + { + if (p0) jump .hexagon_umoddi3_return + } + .falign +1: + { + p0 = cmp.gtu(r13:12,r3:2) + } + { + r7:6 = sub(r3:2, r13:12) + r9:8 = add(r1:0, r15:14) + } + { + r1:0 = vmux(p0, r1:0, r9:8) + r3:2 = vmux(p0, r3:2, r7:6) + } + { + r15:14 = lsr(r15:14, #1) + r13:12 = lsr(r13:12, #1) + }:endloop0 + +.hexagon_umoddi3_return: + { + r1:0 = r3:2 + jumpr r31 + } +FUNCTION_END __hexagon_umoddi3 + + .globl __qdsp_umoddi3 + .set __qdsp_umoddi3, __hexagon_umoddi3 diff --git a/src/hexagon/umodsi3.s b/src/hexagon/umodsi3.s new file mode 100644 index 00000000..1b592a7c --- /dev/null +++ b/src/hexagon/umodsi3.s @@ -0,0 +1,34 @@ + + +FUNCTION_BEGIN __hexagon_umodsi3 + { + r2 = cl0(r0) + r3 = cl0(r1) + p0 = cmp.gtu(r1,r0) + } + { + r2 = sub(r3,r2) + if (p0) jumpr r31 + } + { + loop0(1f,r2) + p1 = cmp.eq(r2,#0) + r2 = lsl(r1,r2) + } + .falign +1: + { + p0 = cmp.gtu(r2,r0) + if (!p0.new) r0 = sub(r0,r2) + r2 = lsr(r2,#1) + if (p1) r1 = #0 + }:endloop0 + { + p0 = cmp.gtu(r2,r0) + if (!p0.new) r0 = sub(r0,r1) + jumpr r31 + } +FUNCTION_END __hexagon_umodsi3 + + .globl __qdsp_umodsi3 + .set __qdsp_umodsi3, __hexagon_umodsi3 diff --git a/src/int/addsub.rs b/src/int/addsub.rs index f4841e90..e95590d8 100644 --- a/src/int/addsub.rs +++ b/src/int/addsub.rs @@ -1,6 +1,6 @@ -use int::{DInt, Int}; +use crate::int::{DInt, Int, MinInt}; -trait UAddSub: DInt { +trait UAddSub: DInt + Int { fn uadd(self, other: Self) -> Self { let (lo, carry) = self.lo().overflowing_add(other.lo()); let hi = self.hi().wrapping_add(other.hi()); @@ -22,7 +22,7 @@ impl UAddSub for u128 {} trait AddSub: Int where - ::UnsignedInt: UAddSub, + ::UnsignedInt: UAddSub, { fn add(self, other: Self) -> Self { Self::from_unsigned(self.unsigned().uadd(other.unsigned())) @@ -37,7 +37,7 @@ impl AddSub for i128 {} trait Addo: AddSub where - ::UnsignedInt: UAddSub, + ::UnsignedInt: UAddSub, { fn addo(self, other: Self) -> (Self, bool) { let sum = AddSub::add(self, other); @@ -50,7 +50,7 @@ impl Addo for u128 {} trait Subo: AddSub where - ::UnsignedInt: UAddSub, + ::UnsignedInt: UAddSub, { fn subo(self, other: Self) -> (Self, bool) { let sum = AddSub::sub(self, other); diff --git a/src/int/big.rs 
b/src/int/big.rs new file mode 100644 index 00000000..0ef3caae --- /dev/null +++ b/src/int/big.rs @@ -0,0 +1,294 @@ +//! Integers used for wide operations, larger than `u128`. + +#![allow(unused)] + +use crate::int::{DInt, HInt, Int, MinInt}; +use core::{fmt, ops}; + +const WORD_LO_MASK: u64 = 0x00000000ffffffff; +const WORD_HI_MASK: u64 = 0xffffffff00000000; +const WORD_FULL_MASK: u64 = 0xffffffffffffffff; +const U128_LO_MASK: u128 = u64::MAX as u128; +const U128_HI_MASK: u128 = (u64::MAX as u128) << 64; + +/// A 256-bit unsigned integer represented as 4 64-bit limbs. +/// +/// Each limb is a native-endian number, but the array is little-limb-endian. +#[allow(non_camel_case_types)] +#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)] +pub struct u256(pub [u64; 4]); + +impl u256 { + pub const MAX: Self = Self([u64::MAX, u64::MAX, u64::MAX, u64::MAX]); + + /// Reinterpret as a signed integer + pub fn signed(self) -> i256 { + i256(self.0) + } +} + +/// A 256-bit signed integer represented as 4 64-bit limbs. +/// +/// Each limb is a native-endian number, but the array is little-limb-endian. +#[allow(non_camel_case_types)] +#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)] +pub struct i256(pub [u64; 4]); + +impl i256 { + /// Reinterpret as an unsigned integer + pub fn unsigned(self) -> u256 { + u256(self.0) + } +} + +impl MinInt for u256 { + type OtherSign = i256; + + type UnsignedInt = u256; + + const SIGNED: bool = false; + const BITS: u32 = 256; + const ZERO: Self = Self([0u64; 4]); + const ONE: Self = Self([1, 0, 0, 0]); + const MIN: Self = Self([0u64; 4]); + const MAX: Self = Self([u64::MAX; 4]); +} + +impl MinInt for i256 { + type OtherSign = u256; + + type UnsignedInt = u256; + + const SIGNED: bool = false; + const BITS: u32 = 256; + const ZERO: Self = Self([0u64; 4]); + const ONE: Self = Self([1, 0, 0, 0]); + const MIN: Self = Self([0, 0, 0, 1 << 63]); + const MAX: Self = Self([u64::MAX, u64::MAX, u64::MAX, u64::MAX << 1]); +} + +macro_rules! impl_common { + ($ty:ty) => { + impl ops::BitOr for $ty { + type Output = Self; + + fn bitor(mut self, rhs: Self) -> Self::Output { + self.0[0] |= rhs.0[0]; + self.0[1] |= rhs.0[1]; + self.0[2] |= rhs.0[2]; + self.0[3] |= rhs.0[3]; + self + } + } + + impl ops::Not for $ty { + type Output = Self; + + fn not(self) -> Self::Output { + Self([!self.0[0], !self.0[1], !self.0[2], !self.0[3]]) + } + } + + impl ops::Shl for $ty { + type Output = Self; + + fn shl(self, rhs: u32) -> Self::Output { + unimplemented!("only used to meet trait bounds") + } + } + }; +} + +impl_common!(i256); +impl_common!(u256); + +impl ops::Shr for u256 { + type Output = Self; + + fn shr(self, rhs: u32) -> Self::Output { + assert!(rhs < Self::BITS, "attempted to shift right with overflow"); + + if rhs == 0 { + return self; + } + + let mut ret = self; + let byte_shift = rhs / 64; + let bit_shift = rhs % 64; + + for idx in 0..4 { + let base_idx = idx + byte_shift as usize; + + let Some(base) = ret.0.get(base_idx) else { + ret.0[idx] = 0; + continue; + }; + + let mut new_val = base >> bit_shift; + + if let Some(new) = ret.0.get(base_idx + 1) { + new_val |= new.overflowing_shl(64 - bit_shift).0; + } + + ret.0[idx] = new_val; + } + + ret + } +} + +macro_rules! 
word { + (1, $val:expr) => { + (($val >> (32 * 3)) & Self::from(WORD_LO_MASK)) as u64 + }; + (2, $val:expr) => { + (($val >> (32 * 2)) & Self::from(WORD_LO_MASK)) as u64 + }; + (3, $val:expr) => { + (($val >> (32 * 1)) & Self::from(WORD_LO_MASK)) as u64 + }; + (4, $val:expr) => { + (($val >> (32 * 0)) & Self::from(WORD_LO_MASK)) as u64 + }; +} + +impl HInt for u128 { + type D = u256; + + fn widen(self) -> Self::D { + let w0 = self & u128::from(u64::MAX); + let w1 = (self >> u64::BITS) & u128::from(u64::MAX); + u256([w0 as u64, w1 as u64, 0, 0]) + } + + fn zero_widen(self) -> Self::D { + self.widen() + } + + fn zero_widen_mul(self, rhs: Self) -> Self::D { + let product11: u64 = word!(1, self) * word!(1, rhs); + let product12: u64 = word!(1, self) * word!(2, rhs); + let product13: u64 = word!(1, self) * word!(3, rhs); + let product14: u64 = word!(1, self) * word!(4, rhs); + let product21: u64 = word!(2, self) * word!(1, rhs); + let product22: u64 = word!(2, self) * word!(2, rhs); + let product23: u64 = word!(2, self) * word!(3, rhs); + let product24: u64 = word!(2, self) * word!(4, rhs); + let product31: u64 = word!(3, self) * word!(1, rhs); + let product32: u64 = word!(3, self) * word!(2, rhs); + let product33: u64 = word!(3, self) * word!(3, rhs); + let product34: u64 = word!(3, self) * word!(4, rhs); + let product41: u64 = word!(4, self) * word!(1, rhs); + let product42: u64 = word!(4, self) * word!(2, rhs); + let product43: u64 = word!(4, self) * word!(3, rhs); + let product44: u64 = word!(4, self) * word!(4, rhs); + + let sum0: u128 = u128::from(product44); + let sum1: u128 = u128::from(product34) + u128::from(product43); + let sum2: u128 = u128::from(product24) + u128::from(product33) + u128::from(product42); + let sum3: u128 = u128::from(product14) + + u128::from(product23) + + u128::from(product32) + + u128::from(product41); + let sum4: u128 = u128::from(product13) + u128::from(product22) + u128::from(product31); + let sum5: u128 = u128::from(product12) + u128::from(product21); + let sum6: u128 = u128::from(product11); + + let r0: u128 = + (sum0 & u128::from(WORD_FULL_MASK)) + ((sum1 & u128::from(WORD_LO_MASK)) << 32); + let r1: u128 = (sum0 >> 64) + + ((sum1 >> 32) & u128::from(WORD_FULL_MASK)) + + (sum2 & u128::from(WORD_FULL_MASK)) + + ((sum3 << 32) & u128::from(WORD_HI_MASK)); + + let (lo, carry) = r0.overflowing_add(r1 << 64); + let hi = (r1 >> 64) + + (sum1 >> 96) + + (sum2 >> 64) + + (sum3 >> 32) + + sum4 + + (sum5 << 32) + + (sum6 << 64) + + u128::from(carry); + + u256([ + (lo & U128_LO_MASK) as u64, + ((lo >> 64) & U128_LO_MASK) as u64, + (hi & U128_LO_MASK) as u64, + ((hi >> 64) & U128_LO_MASK) as u64, + ]) + } + + fn widen_mul(self, rhs: Self) -> Self::D { + self.zero_widen_mul(rhs) + } + + fn widen_hi(self) -> Self::D { + self.widen() << ::BITS + } +} + +impl HInt for i128 { + type D = i256; + + fn widen(self) -> Self::D { + let mut ret = self.unsigned().zero_widen().signed(); + if self.is_negative() { + ret.0[2] = u64::MAX; + ret.0[3] = u64::MAX; + } + ret + } + + fn zero_widen(self) -> Self::D { + self.unsigned().zero_widen().signed() + } + + fn zero_widen_mul(self, rhs: Self) -> Self::D { + self.unsigned().zero_widen_mul(rhs.unsigned()).signed() + } + + fn widen_mul(self, rhs: Self) -> Self::D { + unimplemented!("signed i128 widening multiply is not used") + } + + fn widen_hi(self) -> Self::D { + self.widen() << ::BITS + } +} + +impl DInt for u256 { + type H = u128; + + fn lo(self) -> Self::H { + let mut tmp = [0u8; 16]; + 
tmp[..8].copy_from_slice(&self.0[0].to_le_bytes()); + tmp[8..].copy_from_slice(&self.0[1].to_le_bytes()); + u128::from_le_bytes(tmp) + } + + fn hi(self) -> Self::H { + let mut tmp = [0u8; 16]; + tmp[..8].copy_from_slice(&self.0[2].to_le_bytes()); + tmp[8..].copy_from_slice(&self.0[3].to_le_bytes()); + u128::from_le_bytes(tmp) + } +} + +impl DInt for i256 { + type H = i128; + + fn lo(self) -> Self::H { + let mut tmp = [0u8; 16]; + tmp[..8].copy_from_slice(&self.0[0].to_le_bytes()); + tmp[8..].copy_from_slice(&self.0[1].to_le_bytes()); + i128::from_le_bytes(tmp) + } + + fn hi(self) -> Self::H { + let mut tmp = [0u8; 16]; + tmp[..8].copy_from_slice(&self.0[2].to_le_bytes()); + tmp[8..].copy_from_slice(&self.0[3].to_le_bytes()); + i128::from_le_bytes(tmp) + } +} diff --git a/src/int/bswap.rs b/src/int/bswap.rs new file mode 100644 index 00000000..9df80204 --- /dev/null +++ b/src/int/bswap.rs @@ -0,0 +1,22 @@ +intrinsics! { + #[maybe_use_optimized_c_shim] + #[avr_skip] + /// Swaps bytes in 32-bit number + pub extern "C" fn __bswapsi2(x: u32) -> u32 { + x.swap_bytes() + } + + #[maybe_use_optimized_c_shim] + #[avr_skip] + /// Swaps bytes in 64-bit number + pub extern "C" fn __bswapdi2(x: u64) -> u64 { + x.swap_bytes() + } + + #[maybe_use_optimized_c_shim] + #[avr_skip] + /// Swaps bytes in 128-bit number + pub extern "C" fn __bswapti2(x: u128) -> u128 { + x.swap_bytes() + } +} diff --git a/src/int/leading_zeros.rs b/src/int/leading_zeros.rs index 9e60ab0d..1fee9fcf 100644 --- a/src/int/leading_zeros.rs +++ b/src/int/leading_zeros.rs @@ -3,10 +3,12 @@ // adding a zero check at the beginning, but `__clzsi2` has a precondition that `x != 0`. // Compilers will insert the check for zero in cases where it is needed. +use crate::int::{CastInto, Int}; + public_test_dep! { /// Returns the number of leading binary zeros in `x`. #[allow(dead_code)] -pub(crate) fn usize_leading_zeros_default(x: usize) -> usize { +pub(crate) fn leading_zeros_default>(x: T) -> usize { // The basic idea is to test if the higher bits of `x` are zero and bisect the number // of leading zeros. It is possible for all branches of the bisection to use the same // code path by conditionally shifting the higher parts down to let the next bisection @@ -16,46 +18,47 @@ pub(crate) fn usize_leading_zeros_default(x: usize) -> usize { // because it simplifies the final bisection step. let mut x = x; // the number of potential leading zeros - let mut z = usize::MAX.count_ones() as usize; + let mut z = T::BITS as usize; // a temporary - let mut t: usize; - #[cfg(target_pointer_width = "64")] - { + let mut t: T; + + const { assert!(T::BITS <= 64) }; + if T::BITS >= 64 { t = x >> 32; - if t != 0 { + if t != T::ZERO { z -= 32; x = t; } } - #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] - { + if T::BITS >= 32 { t = x >> 16; - if t != 0 { + if t != T::ZERO { z -= 16; x = t; } } + const { assert!(T::BITS >= 16) }; t = x >> 8; - if t != 0 { + if t != T::ZERO { z -= 8; x = t; } t = x >> 4; - if t != 0 { + if t != T::ZERO { z -= 4; x = t; } t = x >> 2; - if t != 0 { + if t != T::ZERO { z -= 2; x = t; } // the last two bisections are combined into one conditional t = x >> 1; - if t != 0 { + if t != T::ZERO { z - 2 } else { - z - x + z - x.cast() } // We could potentially save a few cycles by using the LUT trick from @@ -80,12 +83,12 @@ pub(crate) fn usize_leading_zeros_default(x: usize) -> usize { public_test_dep! { /// Returns the number of leading binary zeros in `x`. 
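// As a concrete trace of the bisection in `leading_zeros_default` above:
// for x = 0x0000_F000_u32, z starts at 32; x >> 16 == 0 leaves z alone,
// x >> 8 == 0xF0 gives z = 24 and x = 0xF0, then z = 20 (x = 0xF) and
// z = 18 (x = 0b11), and the final combined step returns z - 2 == 16,
// matching 0x0000_F000_u32.leading_zeros().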
#[allow(dead_code)] -pub(crate) fn usize_leading_zeros_riscv(x: usize) -> usize { +pub(crate) fn leading_zeros_riscv>(x: T) -> usize { let mut x = x; // the number of potential leading zeros - let mut z = usize::MAX.count_ones() as usize; + let mut z = T::BITS; // a temporary - let mut t: usize; + let mut t: u32; // RISC-V does not have a set-if-greater-than-or-equal instruction and // `(x >= power-of-two) as usize` will get compiled into two instructions, but this is @@ -95,11 +98,11 @@ pub(crate) fn usize_leading_zeros_riscv(x: usize) -> usize { // right). If we try to save an instruction by using `x < imm` for each bisection, we // have to shift `x` left and compare with powers of two approaching `usize::MAX + 1`, // but the immediate will never fit into 12 bits and never save an instruction. - #[cfg(target_pointer_width = "64")] - { + const { assert!(T::BITS <= 64) }; + if T::BITS >= 64 { // If the upper 32 bits of `x` are not all 0, `t` is set to `1 << 5`, otherwise // `t` is set to 0. - t = ((x >= (1 << 32)) as usize) << 5; + t = ((x >= (T::ONE << 32)) as u32) << 5; // If `t` was set to `1 << 5`, then the upper 32 bits are shifted down for the // next step to process. x >>= t; @@ -107,43 +110,56 @@ pub(crate) fn usize_leading_zeros_riscv(x: usize) -> usize { // leading zeros z -= t; } - #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] - { - t = ((x >= (1 << 16)) as usize) << 4; + if T::BITS >= 32 { + t = ((x >= (T::ONE << 16)) as u32) << 4; x >>= t; z -= t; } - t = ((x >= (1 << 8)) as usize) << 3; + const { assert!(T::BITS >= 16) }; + t = ((x >= (T::ONE << 8)) as u32) << 3; x >>= t; z -= t; - t = ((x >= (1 << 4)) as usize) << 2; + t = ((x >= (T::ONE << 4)) as u32) << 2; x >>= t; z -= t; - t = ((x >= (1 << 2)) as usize) << 1; + t = ((x >= (T::ONE << 2)) as u32) << 1; x >>= t; z -= t; - t = (x >= (1 << 1)) as usize; + t = (x >= (T::ONE << 1)) as u32; x >>= t; z -= t; // All bits except the LSB are guaranteed to be zero for this final bisection step. // If `x != 0` then `x == 1` and subtracts one potential zero from `z`. - z - x + z as usize - x.cast() } } intrinsics! { - #[maybe_use_optimized_c_shim] - #[cfg(any( - target_pointer_width = "16", - target_pointer_width = "32", - target_pointer_width = "64" - ))] - /// Returns the number of leading binary zeros in `x`. - pub extern "C" fn __clzsi2(x: usize) -> usize { + /// Returns the number of leading binary zeros in `x` + pub extern "C" fn __clzsi2(x: u32) -> usize { if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) { - usize_leading_zeros_riscv(x) + leading_zeros_riscv(x) + } else { + leading_zeros_default(x) + } + } + + /// Returns the number of leading binary zeros in `x` + pub extern "C" fn __clzdi2(x: u64) -> usize { + if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) { + leading_zeros_riscv(x) + } else { + leading_zeros_default(x) + } + } + + /// Returns the number of leading binary zeros in `x` + pub extern "C" fn __clzti2(x: u128) -> usize { + let hi = (x >> 64) as u64; + if hi == 0 { + 64 + __clzdi2(x as u64) } else { - usize_leading_zeros_default(x) + __clzdi2(hi) } } } diff --git a/src/int/mod.rs b/src/int/mod.rs index 509f9fda..e6f31c53 100644 --- a/src/int/mod.rs +++ b/src/int/mod.rs @@ -3,42 +3,31 @@ use core::ops; mod specialized_div_rem; pub mod addsub; +mod big; +pub mod bswap; pub mod leading_zeros; pub mod mul; pub mod sdiv; pub mod shift; +pub mod trailing_zeros; pub mod udiv; -pub use self::leading_zeros::__clzsi2; +pub use big::{i256, u256}; public_test_dep! 
{ -/// Trait for some basic operations on integers -pub(crate) trait Int: - Copy +/// Minimal integer implementations needed on all integer types, including wide integers. +#[allow(dead_code)] +pub(crate) trait MinInt: Copy + core::fmt::Debug - + PartialEq - + PartialOrd - + ops::AddAssign - + ops::SubAssign - + ops::BitAndAssign - + ops::BitOrAssign - + ops::BitXorAssign - + ops::ShlAssign - + ops::ShrAssign - + ops::Add - + ops::Sub - + ops::Div - + ops::Shl - + ops::Shr + ops::BitOr - + ops::BitXor - + ops::BitAnd + ops::Not + + ops::Shl { + /// Type with the same width but other signedness - type OtherSign: Int; + type OtherSign: MinInt; /// Unsigned version of Self - type UnsignedInt: Int; + type UnsignedInt: MinInt; /// If `Self` is a signed integer const SIGNED: bool; @@ -50,13 +39,47 @@ pub(crate) trait Int: const ONE: Self; const MIN: Self; const MAX: Self; +} +} +public_test_dep! { +/// Trait for some basic operations on integers +#[allow(dead_code)] +pub(crate) trait Int: MinInt + + PartialEq + + PartialOrd + + ops::AddAssign + + ops::SubAssign + + ops::BitAndAssign + + ops::BitOrAssign + + ops::BitXorAssign + + ops::ShlAssign + + ops::ShrAssign + + ops::Add + + ops::Sub + + ops::Mul + + ops::Div + + ops::Shr + + ops::BitXor + + ops::BitAnd +{ /// LUT used for maximizing the space covered and minimizing the computational cost of fuzzing /// in `testcrate`. For example, Self = u128 produces [0,1,2,7,8,15,16,31,32,63,64,95,96,111, /// 112,119,120,125,126,127]. - const FUZZ_LENGTHS: [u8; 20]; + const FUZZ_LENGTHS: [u8; 20] = make_fuzz_lengths(::BITS); + /// The number of entries of `FUZZ_LENGTHS` actually used. The maximum is 20 for u128. - const FUZZ_NUM: usize; + const FUZZ_NUM: usize = { + let log2 = (::BITS - 1).count_ones() as usize; + if log2 == 3 { + // case for u8 + 6 + } else { + // 3 entries on each extreme, 2 in the middle, and 4 for each scale of intermediate + // boundaries. + 8 + (4 * (log2 - 4)) + } + }; fn unsigned(self) -> Self::UnsignedInt; fn from_unsigned(unsigned: Self::UnsignedInt) -> Self; @@ -80,77 +103,58 @@ pub(crate) trait Int: fn rotate_left(self, other: u32) -> Self; fn overflowing_add(self, other: Self) -> (Self, bool); fn leading_zeros(self) -> u32; + fn ilog2(self) -> u32; } } +pub(crate) const fn make_fuzz_lengths(bits: u32) -> [u8; 20] { + let mut v = [0u8; 20]; + v[0] = 0; + v[1] = 1; + v[2] = 2; // important for parity and the iX::MIN case when reversed + let mut i = 3; + + // No need for any more until the byte boundary, because there should be no algorithms + // that are sensitive to anything not next to byte boundaries after 2. We also scale + // in powers of two, which is important to prevent u128 corner tests from getting too + // big. + let mut l = 8; + loop { + if l >= ((bits / 2) as u8) { + break; + } + // get both sides of the byte boundary + v[i] = l - 1; + i += 1; + v[i] = l; + i += 1; + l *= 2; + } + + if bits != 8 { + // add the lower side of the middle boundary + v[i] = ((bits / 2) - 1) as u8; + i += 1; + } + + // We do not want to jump directly from the Self::BITS/2 boundary to the Self::BITS + // boundary because of algorithms that split the high part up. We reverse the scaling + // as we go to Self::BITS. + let mid = i; + let mut j = 1; + loop { + v[i] = (bits as u8) - (v[mid - j]) - 1; + if j == mid { + break; + } + i += 1; + j += 1; + } + v +} + macro_rules! 
int_impl_common { ($ty:ty) => { - const BITS: u32 = ::ZERO.count_zeros(); - const SIGNED: bool = Self::MIN != Self::ZERO; - - const ZERO: Self = 0; - const ONE: Self = 1; - const MIN: Self = ::MIN; - const MAX: Self = ::MAX; - - const FUZZ_LENGTHS: [u8; 20] = { - let bits = ::BITS; - let mut v = [0u8; 20]; - v[0] = 0; - v[1] = 1; - v[2] = 2; // important for parity and the iX::MIN case when reversed - let mut i = 3; - // No need for any more until the byte boundary, because there should be no algorithms - // that are sensitive to anything not next to byte boundaries after 2. We also scale - // in powers of two, which is important to prevent u128 corner tests from getting too - // big. - let mut l = 8; - loop { - if l >= ((bits / 2) as u8) { - break; - } - // get both sides of the byte boundary - v[i] = l - 1; - i += 1; - v[i] = l; - i += 1; - l *= 2; - } - - if bits != 8 { - // add the lower side of the middle boundary - v[i] = ((bits / 2) - 1) as u8; - i += 1; - } - - // We do not want to jump directly from the Self::BITS/2 boundary to the Self::BITS - // boundary because of algorithms that split the high part up. We reverse the scaling - // as we go to Self::BITS. - let mid = i; - let mut j = 1; - loop { - v[i] = (bits as u8) - (v[mid - j]) - 1; - if j == mid { - break; - } - i += 1; - j += 1; - } - v - }; - - const FUZZ_NUM: usize = { - let log2 = (::BITS - 1).count_ones() as usize; - if log2 == 3 { - // case for u8 - 6 - } else { - // 3 entries on each extreme, 2 in the middle, and 4 for each scale of intermediate - // boundaries. - 8 + (4 * (log2 - 4)) - } - }; - fn from_bool(b: bool) -> Self { b as $ty } @@ -198,15 +202,29 @@ macro_rules! int_impl_common { fn leading_zeros(self) -> u32 { ::leading_zeros(self) } + + fn ilog2(self) -> u32 { + ::ilog2(self) + } }; } macro_rules! int_impl { ($ity:ty, $uty:ty) => { - impl Int for $uty { + impl MinInt for $uty { type OtherSign = $ity; type UnsignedInt = $uty; + const BITS: u32 = ::ZERO.count_zeros(); + const SIGNED: bool = Self::MIN != Self::ZERO; + + const ZERO: Self = 0; + const ONE: Self = 1; + const MIN: Self = ::MIN; + const MAX: Self = ::MAX; + } + + impl Int for $uty { fn unsigned(self) -> $uty { self } @@ -228,10 +246,20 @@ macro_rules! int_impl { int_impl_common!($uty); } - impl Int for $ity { + impl MinInt for $ity { type OtherSign = $uty; type UnsignedInt = $uty; + const BITS: u32 = ::ZERO.count_zeros(); + const SIGNED: bool = Self::MIN != Self::ZERO; + + const ZERO: Self = 0; + const ONE: Self = 1; + const MIN: Self = ::MIN; + const MAX: Self = ::MAX; + } + + impl Int for $ity { fn unsigned(self) -> $uty { self as $uty } @@ -259,18 +287,22 @@ int_impl!(i128, u128); public_test_dep! { /// Trait for integers twice the bit width of another integer. This is implemented for all /// primitives except for `u8`, because there is not a smaller primitive. 
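///
/// For example, the new `u256` splits into two `u128` halves: with limbs
/// `u256([a, b, c, d])`, `lo()` reassembles `a | (b << 64)` and `hi()`
/// reassembles `c | (d << 64)`, while the defaulted `from_lo_hi` joins
/// them back as `lo.zero_widen() | hi.widen_hi()`.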
@@ -259,18 +287,22 @@ int_impl!(i128, u128);
 
 public_test_dep! {
 /// Trait for integers twice the bit width of another integer. This is implemented for all
 /// primitives except for `u8`, because there is not a smaller primitive.
-pub(crate) trait DInt: Int {
+pub(crate) trait DInt: MinInt {
     /// Integer that is half the bit width of the integer this trait is implemented for
-    type H: HInt + Int;
+    type H: HInt;
 
     /// Returns the low half of `self`
     fn lo(self) -> Self::H;
     /// Returns the high half of `self`
     fn hi(self) -> Self::H;
     /// Returns the low and high halves of `self` as a tuple
-    fn lo_hi(self) -> (Self::H, Self::H);
+    fn lo_hi(self) -> (Self::H, Self::H) {
+        (self.lo(), self.hi())
+    }
     /// Constructs an integer using lower and higher half parts
-    fn from_lo_hi(lo: Self::H, hi: Self::H) -> Self;
+    fn from_lo_hi(lo: Self::H, hi: Self::H) -> Self {
+        lo.zero_widen() | hi.widen_hi()
+    }
 }
 }
 
@@ -279,7 +311,11 @@ public_test_dep! {
 /// Trait for integers half the bit width of another integer. This is implemented for all
 /// primitives except for `u128`, because there is not a larger primitive.
 pub(crate) trait HInt: Int {
     /// Integer that is double the bit width of the integer this trait is implemented for
-    type D: DInt + Int;
+    type D: DInt + MinInt;
+
+    // NB: some of the below methods could have default implementations (e.g. `widen_hi`), but for
+    // unknown reasons this can cause infinite recursion when optimizations are disabled. See
+    // for context.
 
     /// Widens (using default extension) the integer to have double bit width
     fn widen(self) -> Self::D;
@@ -305,13 +341,7 @@ macro_rules! impl_d_int {
                 self as $X
             }
             fn hi(self) -> Self::H {
-                (self >> <$X as Int>::BITS) as $X
-            }
-            fn lo_hi(self) -> (Self::H, Self::H) {
-                (self.lo(), self.hi())
-            }
-            fn from_lo_hi(lo: Self::H, hi: Self::H) -> Self {
-                lo.zero_widen() | hi.widen_hi()
+                (self >> <$X as MinInt>::BITS) as $X
             }
         }
     )*
@@ -330,15 +360,15 @@ macro_rules! impl_h_int {
             fn zero_widen(self) -> Self::D {
                 (self as $uH) as $X
             }
-            fn widen_hi(self) -> Self::D {
-                (self as $X) << <$H as Int>::BITS
-            }
             fn zero_widen_mul(self, rhs: Self) -> Self::D {
                 self.zero_widen().wrapping_mul(rhs.zero_widen())
             }
             fn widen_mul(self, rhs: Self) -> Self::D {
                 self.widen().wrapping_mul(rhs.widen())
             }
+            fn widen_hi(self) -> Self::D {
+                (self as $X) << <Self as MinInt>::BITS
+            }
         }
     )*
     };
@@ -361,6 +391,16 @@ public_test_dep! {
 pub(crate) trait CastInto<T: Copy>: Copy {
     fn cast(self) -> T;
 }
+
+pub(crate) trait CastFrom<T>: Copy {
+    fn cast_from(value: T) -> Self;
+}
+}
+
+impl<T, U: CastInto<T> + Copy> CastFrom<U> for T {
+    fn cast_from(value: U) -> Self {
+        value.cast()
+    }
 }
 
 macro_rules! cast_into {
diff --git a/src/int/mul.rs b/src/int/mul.rs
index 07ce061c..e0093a72 100644
--- a/src/int/mul.rs
+++ b/src/int/mul.rs
@@ -1,6 +1,6 @@
-use int::{DInt, HInt, Int};
+use crate::int::{DInt, HInt, Int};
 
-trait Mul: DInt
+trait Mul: DInt + Int
 where
     Self::H: DInt,
 {
@@ -30,7 +30,7 @@ where
 impl Mul for u64 {}
 impl Mul for i128 {}
 
-pub(crate) trait UMulo: Int + DInt {
+pub(crate) trait UMulo: DInt + Int {
     fn mulo(self, rhs: Self) -> (Self, bool) {
         match (self.hi().is_zero(), rhs.hi().is_zero()) {
             // overflow is guaranteed
diff --git a/src/int/sdiv.rs b/src/int/sdiv.rs
index f1822f0f..9d316c76 100644
--- a/src/int/sdiv.rs
+++ b/src/int/sdiv.rs
@@ -1,4 +1,4 @@
-use int::udiv::*;
+use crate::int::udiv::*;
 
 macro_rules!
sdivmod { ( diff --git a/src/int/shift.rs b/src/int/shift.rs index c90cf1de..31727298 100644 --- a/src/int/shift.rs +++ b/src/int/shift.rs @@ -1,4 +1,4 @@ -use int::{DInt, HInt, Int}; +use crate::int::{DInt, HInt, Int, MinInt}; trait Ashl: DInt { /// Returns `a << b`, requires `b < Self::BITS` diff --git a/src/int/specialized_div_rem/binary_long.rs b/src/int/specialized_div_rem/binary_long.rs index 0d782288..2c61a45e 100644 --- a/src/int/specialized_div_rem/binary_long.rs +++ b/src/int/specialized_div_rem/binary_long.rs @@ -13,9 +13,13 @@ macro_rules! impl_binary_long { $n:tt, // the number of bits in a $iX or $uX $uX:ident, // unsigned integer type for the inputs and outputs of `$fn` $iX:ident // signed integer type with same bitwidth as `$uX` + $(, $fun_attr:meta)* // attributes for the function ) => { /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a /// tuple. + $( + #[$fun_attr] + )* pub fn $fn(duo: $uX, div: $uX) -> ($uX, $uX) { let mut duo = duo; // handle edge cases before calling `$normalization_shift` diff --git a/src/int/specialized_div_rem/mod.rs b/src/int/specialized_div_rem/mod.rs index 77034eb5..a91fe663 100644 --- a/src/int/specialized_div_rem/mod.rs +++ b/src/int/specialized_div_rem/mod.rs @@ -95,8 +95,9 @@ const USE_LZ: bool = { // LZD or LZCNT on SPARC only exists for the VIS 3 extension and later. cfg!(target_feature = "vis3") } else if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) { - // The `B` extension on RISC-V determines if a CLZ assembly instruction exists - cfg!(target_feature = "b") + // The 'Zbb' Basic Bit-Manipulation extension on RISC-V + // determines if a CLZ assembly instruction exists + cfg!(target_feature = "zbb") } else { // All other common targets Rust supports should have CLZ instructions true @@ -135,9 +136,15 @@ fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) { // Whether `trifecta` or `delegate` is faster for 128 bit division depends on the speed at which a // microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is -// faster if the target pointer width is at least 64. +// faster if the target pointer width is at least 64. Note that this +// implementation is additionally included on WebAssembly despite the typical +// pointer width there being 32 because it's typically run on a 64-bit machine +// that has access to faster 64-bit operations. #[cfg(all( - not(any(target_pointer_width = "16", target_pointer_width = "32")), + any( + target_family = "wasm", + not(any(target_pointer_width = "16", target_pointer_width = "32")), + ), not(all(not(feature = "no-asm"), target_arch = "x86_64")), not(any(target_arch = "sparc", target_arch = "sparc64")) ))] @@ -151,10 +158,14 @@ impl_trifecta!( u128 ); -// If the pointer width less than 64, then the target architecture almost certainly does not have -// the fast 64 to 128 bit widening multiplication needed for `trifecta` to be faster. +// If the pointer width less than 64 and this isn't wasm, then the target +// architecture almost certainly does not have the fast 64 to 128 bit widening +// multiplication needed for `trifecta` to be faster. 
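As an aside before the cfg this comment describes: the "widening multiplication" in question is the primitive the generic code in src/int/mul.rs builds from `DInt`/`HInt` halves. A hedged standalone sketch of that decomposition, specialized to 64x64 -> 128 with ad-hoc names (the crate's real version is generic, this is only the shape of the computation):

```rust
// Schoolbook widening multiply: four 32x32 -> 64 partial products.
fn widen_mul(a: u64, b: u64) -> u128 {
    let (al, ah) = (a as u32 as u64, a >> 32);
    let (bl, bh) = (b as u32 as u64, b >> 32);
    let ll = al * bl; // low  x low
    let lh = al * bh; // low  x high
    let hl = ah * bl; // high x low
    let hh = ah * bh; // high x high
    // Sum of the middle column; at most ~3 * 2^32, so it cannot overflow u64.
    let mid = (ll >> 32) + (lh & 0xFFFF_FFFF) + (hl & 0xFFFF_FFFF);
    let lo = (ll & 0xFFFF_FFFF) | (mid << 32);
    let hi = hh + (lh >> 32) + (hl >> 32) + (mid >> 32);
    ((hi as u128) << 64) | lo as u128
}

fn main() {
    for &(a, b) in &[
        (u64::MAX, u64::MAX),
        (0x1234_5678_9ABC_DEF0, 0x0FED_CBA9_8765_4321),
        (0, 7),
    ] {
        assert_eq!(widen_mul(a, b), (a as u128) * (b as u128));
    }
    println!("ok");
}
```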
 #[cfg(all(
-    any(target_pointer_width = "16", target_pointer_width = "32"),
+    not(any(
+        target_family = "wasm",
+        not(any(target_pointer_width = "16", target_pointer_width = "32")),
+    )),
     not(all(not(feature = "no-asm"), target_arch = "x86_64")),
     not(any(target_arch = "sparc", target_arch = "sparc64"))
 ))]
@@ -305,5 +316,6 @@ impl_binary_long!(
     u32_normalization_shift,
     32,
     u32,
-    i32
+    i32,
+    allow(dead_code)
 );
diff --git a/src/int/trailing_zeros.rs b/src/int/trailing_zeros.rs
new file mode 100644
index 00000000..cea366b0
--- /dev/null
+++ b/src/int/trailing_zeros.rs
@@ -0,0 +1,64 @@
+use crate::int::{CastInto, Int};
+
+public_test_dep! {
+/// Returns number of trailing binary zeros in `x`.
+#[allow(dead_code)]
+pub(crate) fn trailing_zeros<T: Int + CastInto<u32> + CastInto<u16> + CastInto<u8>>(x: T) -> usize {
+    let mut x = x;
+    let mut r: u32 = 0;
+    let mut t: u32;
+
+    const { assert!(T::BITS <= 64) };
+    if T::BITS >= 64 {
+        r += ((CastInto::<u32>::cast(x) == 0) as u32) << 5; // if (x has no 32 small bits) t = 32 else 0
+        x >>= r; // remove 32 zero bits
+    }
+
+    if T::BITS >= 32 {
+        t = ((CastInto::<u16>::cast(x) == 0) as u32) << 4; // if (x has no 16 small bits) t = 16 else 0
+        r += t;
+        x >>= t; // x = [0 - 0xFFFF] + higher garbage bits
+    }
+
+    const { assert!(T::BITS >= 16) };
+    t = ((CastInto::<u8>::cast(x) == 0) as u32) << 3;
+    x >>= t; // x = [0 - 0xFF] + higher garbage bits
+    r += t;
+
+    let mut x: u8 = x.cast();
+
+    t = (((x & 0x0F) == 0) as u32) << 2;
+    x >>= t; // x = [0 - 0xF] + higher garbage bits
+    r += t;
+
+    t = (((x & 0x3) == 0) as u32) << 1;
+    x >>= t; // x = [0 - 0x3] + higher garbage bits
+    r += t;
+
+    x &= 3;
+
+    r as usize + ((2 - (x >> 1) as usize) & (((x & 1) == 0) as usize).wrapping_neg())
+}
+}
+
+intrinsics! {
+    /// Returns the number of trailing binary zeros in `x` (32 bit version).
+    pub extern "C" fn __ctzsi2(x: u32) -> usize {
+        trailing_zeros(x)
+    }
+
+    /// Returns the number of trailing binary zeros in `x` (64 bit version).
+    pub extern "C" fn __ctzdi2(x: u64) -> usize {
+        trailing_zeros(x)
+    }
+
+    /// Returns the number of trailing binary zeros in `x` (128 bit version).
+    pub extern "C" fn __ctzti2(x: u128) -> usize {
+        let lo = x as u64;
+        if lo == 0 {
+            64 + __ctzdi2((x >> 64) as u64)
+        } else {
+            __ctzdi2(lo)
+        }
+    }
+}
diff --git a/src/int/udiv.rs b/src/int/udiv.rs
index fb09f87d..c891eede 100644
--- a/src/int/udiv.rs
+++ b/src/int/udiv.rs
@@ -1,8 +1,8 @@
 #[cfg(not(feature = "public-test-deps"))]
-pub(crate) use int::specialized_div_rem::*;
+pub(crate) use crate::int::specialized_div_rem::*;
 
 #[cfg(feature = "public-test-deps")]
-pub use int::specialized_div_rem::*;
+pub use crate::int::specialized_div_rem::*;
 
 intrinsics! {
     #[maybe_use_optimized_c_shim]
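The branchless binary search in `trailing_zeros` is easy to verify against the standard library. A standalone sketch, specialized to `u32` with ad-hoc names (the crate version is generic over `CastInto`):

```rust
// Each step tests whether the low half of the remaining window is all zeros;
// if so, it skips that many bits. The final two bits are decoded without a branch.
fn ctz32(mut x: u32) -> u32 {
    let mut r = 0u32;
    let mut t;
    t = (((x as u16) == 0) as u32) << 4; // low 16 bits all zero? skip 16
    x >>= t;
    r += t;
    t = (((x as u8) == 0) as u32) << 3; // low 8 bits all zero? skip 8
    x >>= t;
    r += t;
    t = (((x & 0x0F) == 0) as u32) << 2;
    x >>= t;
    r += t;
    t = (((x & 0x3) == 0) as u32) << 1;
    x >>= t;
    r += t;
    x &= 3;
    // Adds 0, 1, or 2 depending on the final two bits (and 2 for x == 0 overall).
    r + ((2 - (x >> 1)) & (((x & 1) == 0) as u32).wrapping_neg())
}

fn main() {
    for x in [0u32, 1, 2, 3, 8, 0x8000_0000, 0xFFFF_0000, u32::MAX] {
        assert_eq!(ctz32(x), x.trailing_zeros());
    }
    println!("ok");
}
```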
diff --git a/src/lib.miri.rs b/src/lib.miri.rs
new file mode 100644
index 00000000..17288058
--- /dev/null
+++ b/src/lib.miri.rs
@@ -0,0 +1,5 @@
+//! Grep bootstrap for `MIRI_REPLACE_LIBRS_IF_NOT_TEST` to learn what this is about.
+#![no_std]
+#![feature(rustc_private)]
+extern crate compiler_builtins as real;
+pub use real::*;
diff --git a/src/lib.rs b/src/lib.rs
index a6b61bdf..cfd796eb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,19 +1,18 @@
 #![cfg_attr(feature = "compiler-builtins", compiler_builtins)]
-#![cfg_attr(not(feature = "no-asm"), feature(asm))]
 #![feature(abi_unadjusted)]
-#![cfg_attr(not(feature = "no-asm"), feature(global_asm))]
+#![feature(asm_experimental_arch)]
 #![feature(cfg_target_has_atomic)]
 #![feature(compiler_builtins)]
-#![feature(core_ffi_c)]
 #![feature(core_intrinsics)]
-#![feature(inline_const)]
-#![feature(lang_items)]
 #![feature(linkage)]
 #![feature(naked_functions)]
 #![feature(repr_simd)]
+#![cfg_attr(f16_enabled, feature(f16))]
+#![cfg_attr(f128_enabled, feature(f128))]
 #![no_builtins]
 #![no_std]
 #![allow(unused_features)]
+#![allow(internal_features)]
 // We use `u128` in a whole bunch of places which we currently agree with the
 // compiler on ABIs and such, so we should be "good enough" for now and changes
 // to the `u128` ABI will be reflected here.
@@ -42,22 +41,25 @@ mod macros;
 pub mod float;
 pub mod int;
 
-#[cfg(any(
-    all(target_family = "wasm", target_os = "unknown"),
-    all(target_arch = "x86_64", target_os = "none"),
-    all(target_arch = "x86_64", target_os = "uefi"),
-    all(target_arch = "arm", target_os = "none"),
-    all(target_arch = "xtensa", target_os = "none"),
-    all(target_arch = "mips", target_os = "none"),
-    target_os = "xous",
-    all(target_vendor = "fortanix", target_env = "sgx")
-))]
+// Disable for any of the following:
+// - x86 without sse2 due to ABI issues
+// - 
+// - All unix targets (linux, macos, freebsd, android, etc)
+// - wasm with known target_os
+#[cfg(not(any(
+    all(target_arch = "x86", not(target_feature = "sse2")),
+    unix,
+    all(target_family = "wasm", not(target_os = "unknown"))
+)))]
 pub mod math;
 pub mod mem;
 
 #[cfg(target_arch = "arm")]
 pub mod arm;
 
+#[cfg(any(target_arch = "aarch64", target_arch = "arm64ec"))]
+pub mod aarch64;
+
 #[cfg(all(target_arch = "aarch64", target_os = "linux", not(feature = "no-asm"),))]
 pub mod aarch64_linux;
 
@@ -68,6 +70,9 @@ pub mod aarch64_linux;
 ))]
 pub mod arm_linux;
 
+#[cfg(target_arch = "hexagon")]
+pub mod hexagon;
+
 #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))]
 pub mod riscv;
 
@@ -77,4 +82,12 @@ pub mod x86;
 #[cfg(target_arch = "x86_64")]
 pub mod x86_64;
 
+#[cfg(all(target_os = "solana", target_feature = "static-syscalls"))]
+#[cfg_attr(not(feature = "mangled-names"), no_mangle)]
+#[linkage = "weak"]
+pub unsafe extern "C" fn abort() -> ! {
+    let syscall: extern "C" fn() -> ! = core::mem::transmute(3069975057u64); // murmur32 hash of "abort"
+    syscall()
+}
+
 pub mod probestack;
diff --git a/src/macros.rs b/src/macros.rs
index b11114f1..f51e49e9 100644
--- a/src/macros.rs
+++ b/src/macros.rs
@@ -25,11 +25,12 @@ macro_rules! public_test_dep {
 /// platforms need and elsewhere in this library it just looks like normal Rust
 /// code.
 ///
-/// When the weak-intrinsics feature is enabled, all intrinsics functions are
-/// marked with #[linkage = "weak"] so that they can be replaced by another
-/// implementation at link time. This is particularly useful for mixed Rust/C++
-/// binaries that want to use the C++ intrinsics, otherwise linking against the
-/// Rust stdlib will replace those from the compiler-rt library.
+/// All intrinsics functions are marked with #[linkage = "weak"] when
+/// `not(windows) and not(target_vendor = "apple")`.
+/// `weak` linkage attribute is used so that these functions can be replaced +/// by another implementation at link time. This is particularly useful for mixed +/// Rust/C++ binaries that want to use the C++ intrinsics, otherwise linking against +/// the Rust stdlib will replace those from the compiler-rt library. /// /// This macro is structured to be invoked with a bunch of functions that looks /// like: @@ -53,10 +54,6 @@ macro_rules! public_test_dep { /// /// A quick overview of attributes supported right now are: /// -/// * `weak` - indicates that the function should always be given weak linkage. -/// This attribute must come before other attributes, as the other attributes -/// will generate the final output function and need to have `weak` modify -/// them. /// * `maybe_use_optimized_c_shim` - indicates that the Rust implementation is /// ignored if an optimized C version was compiled. /// * `aapcs_on_arm` - forces the ABI of the function to be `"aapcs"` on ARM and @@ -68,6 +65,9 @@ macro_rules! public_test_dep { /// it's a normal ABI elsewhere for returning a 128 bit integer. /// * `arm_aeabi_alias` - handles the "aliasing" of various intrinsics on ARM /// their otherwise typical names to other prefixed ones. +/// * `ppc_alias` - changes the name of the symbol on PowerPC platforms without +/// changing any other behavior. This is mostly for `f128`, which is `tf` on +/// most platforms but `kf` on PowerPC. macro_rules! intrinsics { () => (); @@ -128,67 +128,6 @@ macro_rules! intrinsics { intrinsics!($($rest)*); ); - // Explicit weak linkage gets dropped when weak-intrinsics is on since it - // will be added unconditionally to all intrinsics and would conflict - // otherwise. - ( - #[weak] - $(#[$($attr:tt)*])* - pub extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? { - $($body:tt)* - } - - $($rest:tt)* - ) => ( - #[cfg(feature = "weak-intrinsics")] - intrinsics! { - $(#[$($attr)*])* - pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { - $($body)* - } - } - - #[cfg(not(feature = "weak-intrinsics"))] - intrinsics! { - $(#[$($attr)*])* - #[linkage = "weak"] - pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { - $($body)* - } - } - - intrinsics!($($rest)*); - ); - // Same as above but for unsafe. - ( - #[weak] - $(#[$($attr:tt)*])* - pub unsafe extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? { - $($body:tt)* - } - - $($rest:tt)* - ) => ( - #[cfg(feature = "weak-intrinsics")] - intrinsics! { - $(#[$($attr)*])* - pub unsafe extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { - $($body)* - } - } - - #[cfg(not(feature = "weak-intrinsics"))] - intrinsics! { - $(#[$($attr)*])* - #[linkage = "weak"] - pub unsafe extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { - $($body)* - } - } - - intrinsics!($($rest)*); - ); - // Right now there's a bunch of architecture-optimized intrinsics in the // stock compiler-rt implementation. Not all of these have been ported over // to Rust yet so when the `c` feature of this crate is enabled we fall back @@ -211,7 +150,6 @@ macro_rules! intrinsics { $($rest:tt)* ) => ( #[cfg($name = "optimized-c")] - #[cfg_attr(feature = "weak-intrinsics", linkage = "weak")] pub $(unsafe $($empty)? )? extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { extern $abi { fn $name($($argname: $ty),*) $(-> $ret)?; @@ -311,20 +249,19 @@ macro_rules! 
intrinsics { ) => ( #[cfg(all(any(windows, target_os = "uefi"), target_arch = "x86_64"))] $(#[$($attr)*])* - #[cfg_attr(feature = "weak-intrinsics", linkage = "weak")] pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { $($body)* } - #[cfg(all(any(windows, target_os = "uefi"), target_arch = "x86_64"))] - pub mod $name { - #[cfg_attr(not(feature = "mangled-names"), no_mangle)] - #[cfg_attr(feature = "weak-intrinsics", linkage = "weak")] - pub extern $abi fn $name( $($argname: $ty),* ) - -> ::macros::win64_128bit_abi_hack::U64x2 + #[cfg(all(any(windows, target_os = "uefi"), target_arch = "x86_64", not(feature = "mangled-names")))] + mod $name { + #[no_mangle] + #[cfg_attr(not(all(windows, target_env = "gnu")), linkage = "weak")] + extern $abi fn $name( $($argname: $ty),* ) + -> $crate::macros::win64_128bit_abi_hack::U64x2 { let e: $($ret)? = super::$name($($argname),*); - ::macros::win64_128bit_abi_hack::U64x2::from(e) + $crate::macros::win64_128bit_abi_hack::U64x2::from(e) } } @@ -339,6 +276,106 @@ macro_rules! intrinsics { intrinsics!($($rest)*); ); + // `arm_aeabi_alias` would conflict with `f16_apple_{arg,ret}_abi` not handled here. Avoid macro ambiguity by combining in a + // single `#[]`. + ( + #[apple_f16_arg_abi] + #[arm_aeabi_alias = $alias:ident] + $($t:tt)* + ) => { + intrinsics! { + #[apple_f16_arg_abi, arm_aeabi_alias = $alias] + $($t)* + } + }; + ( + #[apple_f16_ret_abi] + #[arm_aeabi_alias = $alias:ident] + $($t:tt)* + ) => { + intrinsics! { + #[apple_f16_ret_abi, arm_aeabi_alias = $alias] + $($t)* + } + }; + + // On x86 (32-bit and 64-bit) Apple platforms, `f16` is passed and returned like a `u16` unless + // the builtin involves `f128`. + ( + // `arm_aeabi_alias` would conflict if not handled here. Avoid macro ambiguity by combining + // in a single `#[]`. + #[apple_f16_arg_abi $(, arm_aeabi_alias = $alias:ident)?] + $(#[$($attr:tt)*])* + pub extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? { + $($body:tt)* + } + + $($rest:tt)* + ) => ( + #[cfg(all(target_vendor = "apple", any(target_arch = "x86", target_arch = "x86_64")))] + $(#[$($attr)*])* + pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + $($body)* + } + + #[cfg(all(target_vendor = "apple", any(target_arch = "x86", target_arch = "x86_64"), not(feature = "mangled-names")))] + mod $name { + #[no_mangle] + #[cfg_attr(not(all(windows, target_env = "gnu")), linkage = "weak")] + $(#[$($attr)*])* + extern $abi fn $name( $($argname: u16),* ) $(-> $ret)? { + super::$name($(f16::from_bits($argname)),*) + } + } + + #[cfg(not(all(target_vendor = "apple", any(target_arch = "x86", target_arch = "x86_64"))))] + intrinsics! { + $(#[arm_aeabi_alias = $alias])? + $(#[$($attr)*])* + pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + $($body)* + } + } + + intrinsics!($($rest)*); + ); + ( + #[apple_f16_ret_abi $(, arm_aeabi_alias = $alias:ident)?] + $(#[$($attr:tt)*])* + pub extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? { + $($body:tt)* + } + + $($rest:tt)* + ) => ( + #[cfg(all(target_vendor = "apple", any(target_arch = "x86", target_arch = "x86_64")))] + $(#[$($attr)*])* + pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? 
{ + $($body)* + } + + #[cfg(all(target_vendor = "apple", any(target_arch = "x86", target_arch = "x86_64"), not(feature = "mangled-names")))] + mod $name { + #[no_mangle] + #[cfg_attr(not(all(windows, target_env = "gnu")), linkage = "weak")] + $(#[$($attr)*])* + extern $abi fn $name( $($argname: $ty),* ) -> u16 { + super::$name($($argname),*).to_bits() + } + } + + #[cfg(not(all(target_vendor = "apple", any(target_arch = "x86", target_arch = "x86_64"))))] + intrinsics! { + $(#[arm_aeabi_alias = $alias])? + $(#[$($attr)*])* + pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + $($body)* + } + } + + intrinsics!($($rest)*); + ); + // A bunch of intrinsics on ARM are aliased in the standard compiler-rt // build under `__aeabi_*` aliases, and LLVM will call these instead of the // original function. The aliasing here is used to generate these symbols in @@ -353,24 +390,27 @@ macro_rules! intrinsics { $($rest:tt)* ) => ( #[cfg(target_arch = "arm")] + $(#[$($attr)*])* pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { $($body)* } - #[cfg(target_arch = "arm")] - pub mod $name { - #[cfg_attr(not(feature = "mangled-names"), no_mangle)] - #[cfg_attr(feature = "weak-intrinsics", linkage = "weak")] - pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + #[cfg(all(target_arch = "arm", not(feature = "mangled-names")))] + mod $name { + #[no_mangle] + #[cfg_attr(not(all(windows, target_env = "gnu")), linkage = "weak")] + $(#[$($attr)*])* + extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { super::$name($($argname),*) } } - #[cfg(target_arch = "arm")] - pub mod $alias { - #[cfg_attr(not(feature = "mangled-names"), no_mangle)] - #[cfg_attr(any(all(not(windows), not(target_vendor="apple"), feature = "weak-intrinsics")), linkage = "weak")] - pub extern "aapcs" fn $alias( $($argname: $ty),* ) $(-> $ret)? { + #[cfg(all(target_arch = "arm", not(feature = "mangled-names")))] + mod $alias { + #[no_mangle] + #[cfg_attr(not(all(windows, target_env = "gnu")), linkage = "weak")] + $(#[$($attr)*])* + extern "aapcs" fn $alias( $($argname: $ty),* ) $(-> $ret)? { super::$name($($argname),*) } } @@ -386,6 +426,36 @@ macro_rules! intrinsics { intrinsics!($($rest)*); ); + // PowerPC usually uses `kf` rather than `tf` for `f128`. This is just an easy + // way to add an alias on those targets. + ( + #[ppc_alias = $alias:ident] + $(#[$($attr:tt)*])* + pub extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? { + $($body:tt)* + } + + $($rest:tt)* + ) => ( + #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))] + intrinsics! { + $(#[$($attr)*])* + pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + $($body)* + } + } + + #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))] + intrinsics! { + $(#[$($attr)*])* + pub extern $abi fn $alias( $($argname: $ty),* ) $(-> $ret)? { + $($body)* + } + } + + intrinsics!($($rest)*); + ); + // C mem* functions are only generated when the "mem" feature is enabled. ( #[mem_builtin] @@ -401,12 +471,12 @@ macro_rules! intrinsics { $($body)* } - #[cfg(feature = "mem")] - pub mod $name { + #[cfg(all(feature = "mem", not(feature = "mangled-names")))] + mod $name { $(#[$($attr)*])* - #[cfg_attr(not(feature = "mangled-names"), no_mangle)] - #[cfg_attr(feature = "weak-intrinsics", linkage = "weak")] - pub unsafe extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + #[no_mangle] + #[cfg_attr(not(all(windows, target_env = "gnu")), linkage = "weak")] + unsafe extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? 
{ super::$name($($argname),*) } } @@ -425,11 +495,12 @@ macro_rules! intrinsics { $($rest:tt)* ) => ( + // `#[naked]` definitions are referenced by other places, so we can't use `cfg` like the others pub mod $name { #[naked] $(#[$($attr)*])* #[cfg_attr(not(feature = "mangled-names"), no_mangle)] - #[cfg_attr(feature = "weak-intrinsics", linkage = "weak")] + #[cfg_attr(not(all(windows, target_env = "gnu")), linkage = "weak")] pub unsafe extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { $($body)* } @@ -481,48 +552,23 @@ macro_rules! intrinsics { // input we were given. ( $(#[$($attr:tt)*])* - pub extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? { + pub $(unsafe $(@ $empty:tt)?)? extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? { $($body:tt)* } $($rest:tt)* ) => ( $(#[$($attr)*])* - pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + pub $(unsafe $($empty)?)? extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { $($body)* } - pub mod $name { + #[cfg(not(feature = "mangled-names"))] + mod $name { $(#[$($attr)*])* - #[cfg_attr(not(feature = "mangled-names"), no_mangle)] - #[cfg_attr(feature = "weak-intrinsics", linkage = "weak")] - pub extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { - super::$name($($argname),*) - } - } - - intrinsics!($($rest)*); - ); - - // Same as the above for unsafe functions. - ( - $(#[$($attr:tt)*])* - pub unsafe extern $abi:tt fn $name:ident( $($argname:ident: $ty:ty),* ) $(-> $ret:ty)? { - $($body:tt)* - } - - $($rest:tt)* - ) => ( - $(#[$($attr)*])* - pub unsafe extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { - $($body)* - } - - pub mod $name { - $(#[$($attr)*])* - #[cfg_attr(not(feature = "mangled-names"), no_mangle)] - #[cfg_attr(feature = "weak-intrinsics", linkage = "weak")] - pub unsafe extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { + #[no_mangle] + #[cfg_attr(not(all(windows, target_env = "gnu")), linkage = "weak")] + $(unsafe $($empty)?)? extern $abi fn $name( $($argname: $ty),* ) $(-> $ret)? { super::$name($($argname),*) } } @@ -536,20 +582,20 @@ macro_rules! intrinsics { #[cfg(all(any(windows, target_os = "uefi"), target_pointer_width = "64"))] pub mod win64_128bit_abi_hack { #[repr(simd)] - pub struct U64x2(u64, u64); + pub struct U64x2([u64; 2]); impl From for U64x2 { fn from(i: i128) -> U64x2 { - use int::DInt; + use crate::int::DInt; let j = i as u128; - U64x2(j.lo(), j.hi()) + U64x2([j.lo(), j.hi()]) } } impl From for U64x2 { fn from(i: u128) -> U64x2 { - use int::DInt; - U64x2(i.lo(), i.hi()) + use crate::int::DInt; + U64x2([i.lo(), i.hi()]) } } } diff --git a/src/math.rs b/src/math.rs index b4e5fc11..477dfe36 100644 --- a/src/math.rs +++ b/src/math.rs @@ -1,7 +1,10 @@ #[allow(dead_code)] +#[allow(unused_imports)] +#[allow(clippy::all)] #[path = "../libm/src/math/mod.rs"] mod libm; +#[allow(unused_macros)] macro_rules! no_mangle { ($(fn $fun:ident($($iid:ident : $ity:ty),+) -> $oty:ty;)+) => { intrinsics! { @@ -14,17 +17,7 @@ macro_rules! no_mangle { } } -#[cfg(any( - all( - target_family = "wasm", - target_os = "unknown", - not(target_env = "wasi") - ), - target_os = "xous", - all(target_arch = "x86_64", target_os = "uefi"), - all(target_arch = "xtensa", target_os = "none"), - all(target_vendor = "fortanix", target_env = "sgx") -))] +#[cfg(not(windows))] no_mangle! { fn acos(x: f64) -> f64; fn asin(x: f64) -> f64; @@ -40,10 +33,6 @@ no_mangle! 
{ fn log10f(x: f32) -> f32; fn log(x: f64) -> f64; fn logf(x: f32) -> f32; - fn fmin(x: f64, y: f64) -> f64; - fn fminf(x: f32, y: f32) -> f32; - fn fmax(x: f64, y: f64) -> f64; - fn fmaxf(x: f32, y: f32) -> f32; fn round(x: f64) -> f64; fn roundf(x: f32) -> f32; fn rint(x: f64) -> f64; @@ -51,8 +40,6 @@ no_mangle! { fn sin(x: f64) -> f64; fn pow(x: f64, y: f64) -> f64; fn powf(x: f32, y: f32) -> f32; - fn fmod(x: f64, y: f64) -> f64; - fn fmodf(x: f32, y: f32) -> f32; fn acosf(n: f32) -> f32; fn atan2f(a: f32, b: f32) -> f32; fn atanf(n: f32) -> f32; @@ -84,66 +71,17 @@ no_mangle! { fn cbrtf(n: f32) -> f32; fn hypotf(x: f32, y: f32) -> f32; fn tanf(n: f32) -> f32; -} - -#[cfg(any( - all( - target_family = "wasm", - target_os = "unknown", - not(target_env = "wasi") - ), - target_os = "xous", - all(target_arch = "x86_64", target_os = "uefi"), - all(target_arch = "xtensa", target_os = "none"), - all(target_vendor = "fortanix", target_env = "sgx") -))] -intrinsics! { - pub extern "C" fn lgamma_r(x: f64, s: &mut i32) -> f64 { - let r = self::libm::lgamma_r(x); - *s = r.1; - r.0 - } - - pub extern "C" fn lgammaf_r(x: f32, s: &mut i32) -> f32 { - let r = self::libm::lgammaf_r(x); - *s = r.1; - r.0 - } -} -#[cfg(any( - target_os = "xous", - target_os = "uefi", - all(target_arch = "xtensa", target_os = "none"), -))] -no_mangle! { fn sqrtf(x: f32) -> f32; fn sqrt(x: f64) -> f64; -} -#[cfg(any( - all(target_vendor = "fortanix", target_env = "sgx"), - all(target_arch = "xtensa", target_os = "none"), - target_os = "xous", - target_os = "uefi" -))] -no_mangle! { fn ceil(x: f64) -> f64; fn ceilf(x: f32) -> f32; fn floor(x: f64) -> f64; fn floorf(x: f32) -> f32; fn trunc(x: f64) -> f64; fn truncf(x: f32) -> f32; -} -// only for the thumb*-none-eabi*, riscv32*-none-elf, x86_64-unknown-none and mips*-unknown-none targets that lack the floating point instruction set -#[cfg(any( - all(target_arch = "arm", target_os = "none"), - all(target_arch = "riscv32", not(target_feature = "f"), target_os = "none"), - all(target_arch = "x86_64", target_os = "none"), - all(target_arch = "mips", target_os = "none"), -))] -no_mangle! { fn fmin(x: f64, y: f64) -> f64; fn fminf(x: f32, y: f32) -> f32; fn fmax(x: f64, y: f64) -> f64; @@ -153,3 +91,18 @@ no_mangle! { // `f32 % f32` fn fmodf(x: f32, y: f32) -> f32; } + +// allow for windows (and other targets) +intrinsics! { + pub extern "C" fn lgamma_r(x: f64, s: &mut i32) -> f64 { + let r = self::libm::lgamma_r(x); + *s = r.1; + r.0 + } + + pub extern "C" fn lgammaf_r(x: f32, s: &mut i32) -> f32 { + let r = self::libm::lgammaf_r(x); + *s = r.1; + r.0 + } +} diff --git a/src/mem/mod.rs b/src/mem/mod.rs index ccf19177..c22c3a7e 100644 --- a/src/mem/mod.rs +++ b/src/mem/mod.rs @@ -8,26 +8,29 @@ type c_int = i16; #[cfg(not(target_pointer_width = "16"))] type c_int = i32; +#[cfg(not(target_os = "solana"))] use core::intrinsics::{atomic_load_unordered, atomic_store_unordered, exact_div}; +#[cfg(not(target_os = "solana"))] use core::mem; +#[cfg(not(target_os = "solana"))] use core::ops::{BitOr, Shl}; // memcpy/memmove/memset have optimized implementations on some architectures +#[cfg(not(target_os = "solana"))] #[cfg_attr( all(not(feature = "no-asm"), target_arch = "x86_64"), path = "x86_64.rs" )] mod impls; +#[cfg(not(target_os = "solana"))] intrinsics! 
{ - #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), weak)] #[mem_builtin] pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 { impls::copy_forward(dest, src, n); dest } - #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), weak)] #[mem_builtin] pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 { let delta = (dest as usize).wrapping_sub(src as usize); @@ -41,26 +44,22 @@ intrinsics! { dest } - #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), weak)] #[mem_builtin] pub unsafe extern "C" fn memset(s: *mut u8, c: crate::mem::c_int, n: usize) -> *mut u8 { impls::set_bytes(s, c as u8, n); s } - #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), weak)] #[mem_builtin] pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 { impls::compare_bytes(s1, s2, n) } - #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), weak)] #[mem_builtin] pub unsafe extern "C" fn bcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 { memcmp(s1, s2, n) } - #[cfg_attr(not(all(target_os = "windows", target_env = "gnu")), weak)] #[mem_builtin] pub unsafe extern "C" fn strlen(s: *const core::ffi::c_char) -> usize { impls::c_string_length(s) @@ -68,6 +67,7 @@ intrinsics! { } // `bytes` must be a multiple of `mem::size_of::()` +#[cfg(not(target_os = "solana"))] #[cfg_attr(not(target_has_atomic_load_store = "8"), allow(dead_code))] fn memcpy_element_unordered_atomic(dest: *mut T, src: *const T, bytes: usize) { unsafe { @@ -81,6 +81,7 @@ fn memcpy_element_unordered_atomic(dest: *mut T, src: *const T, bytes: } // `bytes` must be a multiple of `mem::size_of::()` +#[cfg(not(target_os = "solana"))] #[cfg_attr(not(target_has_atomic_load_store = "8"), allow(dead_code))] fn memmove_element_unordered_atomic(dest: *mut T, src: *const T, bytes: usize) { unsafe { @@ -104,6 +105,7 @@ fn memmove_element_unordered_atomic(dest: *mut T, src: *const T, bytes: } // `T` must be a primitive integer type, and `bytes` must be a multiple of `mem::size_of::()` +#[cfg(not(target_os = "solana"))] #[cfg_attr(not(target_has_atomic_load_store = "8"), allow(dead_code))] fn memset_element_unordered_atomic(s: *mut T, c: u8, bytes: usize) where @@ -130,6 +132,7 @@ where } } +#[cfg(not(target_os = "solana"))] intrinsics! { #[cfg(target_has_atomic_load_store = "8")] pub unsafe extern "C" fn __llvm_memcpy_element_unordered_atomic_1(dest: *mut u8, src: *const u8, bytes: usize) -> () { @@ -194,3 +197,204 @@ intrinsics! { memset_element_unordered_atomic(s, c, bytes); } } + +// MEM functions have been rewritten to copy 8 byte chunks. No +// compensation for alignment is made here with the requirement that +// the underlying hardware supports unaligned loads/stores. If the +// number of store operations is greater than 8 the memory operation +// is performed in the run-time system instead, by calling the +// corresponding "C" function. 
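Before the Solana implementations below, one detail worth pinning down: each function computes `nstore = n - 7 * (n / 8)`, which is exactly the number of store operations the chunked loop will perform (one per 8-byte chunk plus one per leftover byte), and falls back to the syscall when that exceeds `NSTORE_THRESHOLD` (15 in the code below). A quick standalone check of the identity, not crate code:

```rust
fn main() {
    // n bytes split into 8-byte chunks plus a tail:
    // stores = chunks + tail, and n - 7 * chunks == chunks + (n - 8 * chunks).
    for n in 0..100usize {
        let chunks = n / 8;
        let tail = n - 8 * chunks;
        assert_eq!(n - 7 * chunks, chunks + tail);
    }
    println!("ok");
}
```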
+ +#[cfg(all(target_os = "solana", not(target_feature = "static-syscalls")))] +mod syscalls { + extern "C" { + pub fn sol_memcpy_(dest: *mut u8, src: *const u8, n: u64); + pub fn sol_memmove_(dest: *mut u8, src: *const u8, n: u64); + pub fn sol_memset_(s: *mut u8, c: u8, n: u64); + pub fn sol_memcmp_(s1: *const u8, s2: *const u8, n: u64, result: *mut i32); + } +} + +#[cfg(all(target_os = "solana", target_feature = "static-syscalls"))] +mod syscalls { + pub(crate) fn sol_memcpy_(dest: *mut u8, src: *const u8, n: u64) { + let syscall: extern "C" fn(*mut u8, *const u8, u64) = + unsafe { core::mem::transmute(1904002211u64) }; // murmur32 hash of "sol_memcpy_" + syscall(dest, src, n) + } + + pub(crate) fn sol_memmove_(dest: *mut u8, src: *const u8, n: u64) { + let syscall: extern "C" fn(*mut u8, *const u8, u64) = + unsafe { core::mem::transmute(1128493560u64) }; // murmur32 hash of "sol_memmove_" + syscall(dest, src, n) + } + + pub(crate) fn sol_memcmp_(dest: *const u8, src: *const u8, n: u64, result: *mut i32) { + let syscall: extern "C" fn(*const u8, *const u8, u64, *mut i32) = + unsafe { core::mem::transmute(1608310321u64) }; // murmur32 hash of "sol_memcmp_" + syscall(dest, src, n, result) + } + + pub(crate) fn sol_memset_(dest: *mut u8, c: u8, n: u64) { + let syscall: extern "C" fn(*mut u8, u8, u64) = + unsafe { core::mem::transmute(930151202u64) }; // murmur32 hash of "sol_memset_" + syscall(dest, c, n) + } +} + +#[cfg(target_os = "solana")] +use self::syscalls::*; + +#[cfg(target_os = "solana")] +const NSTORE_THRESHOLD: usize = 15; + +#[cfg(target_os = "solana")] +#[cfg_attr( + all(feature = "mem-unaligned", not(feature = "mangled-names")), + no_mangle +)] +#[inline] +pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 { + let chunks = (n / 8) as isize; + let nstore = n - (7 * chunks) as usize; + if nstore > NSTORE_THRESHOLD { + sol_memcpy_(dest, src, n as u64); + return dest; + } + let mut i: isize = 0; + if chunks != 0 { + let dest_64 = dest as *mut _ as *mut u64; + let src_64 = src as *const _ as *const u64; + while i < chunks { + *dest_64.offset(i) = *src_64.offset(i); + i += 1; + } + i *= 8; + } + while i < n as isize { + *dest.offset(i) = *src.offset(i); + i += 1; + } + dest +} + +#[cfg(target_os = "solana")] +#[cfg_attr( + all(feature = "mem-unaligned", not(feature = "mangled-names")), + no_mangle +)] +#[inline] +pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 { + let chunks = (n / 8) as isize; + let nstore = n - (7 * chunks) as usize; + if nstore > NSTORE_THRESHOLD { + sol_memmove_(dest, src, n as u64); + return dest; + } + if src < dest as *const u8 { + // copy from end + let mut i = n as isize; + while i > chunks * 8 { + i -= 1; + *dest.offset(i) = *src.offset(i); + } + i = chunks; + if i > 0 { + let dest_64 = dest as *mut _ as *mut u64; + let src_64 = src as *const _ as *const u64; + while i > 0 { + i -= 1; + *dest_64.offset(i) = *src_64.offset(i); + } + } + } else { + // copy from beginning + let mut i: isize = 0; + if chunks != 0 { + let dest_64 = dest as *mut _ as *mut u64; + let src_64 = src as *const _ as *const u64; + while i < chunks { + *dest_64.offset(i) = *src_64.offset(i); + i += 1; + } + i *= 8; + } + while i < n as isize { + *dest.offset(i) = *src.offset(i); + i += 1; + } + } + dest +} + +#[cfg(target_os = "solana")] +#[cfg_attr( + all(feature = "mem-unaligned", not(feature = "mangled-names")), + no_mangle +)] +#[inline] +pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 
{ + let chunks = (n / 8) as isize; + let nstore = n - (7 * chunks) as usize; + if nstore > NSTORE_THRESHOLD { + sol_memset_(s, c as u8, n as u64); + return s; + } + let mut i: isize = 0; + if chunks != 0 { + let mut c_64 = c as u64 & 0xFF as u64; + c_64 |= c_64 << 8; + c_64 |= c_64 << 16; + c_64 |= c_64 << 32; + let s_64 = s as *mut _ as *mut u64; + while i < chunks { + *s_64.offset(i) = c_64; + i += 1; + } + i *= 8; + } + while i < n as isize { + *s.offset(i) = c as u8; + i += 1; + } + s +} + +#[cfg(target_os = "solana")] +#[cfg_attr( + all(feature = "mem-unaligned", not(feature = "mangled-names")), + no_mangle +)] +#[inline] +pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 { + let chunks = (n / 8) as isize; + let nstore = n - (7 * chunks) as usize; + if nstore > NSTORE_THRESHOLD { + let mut result = 0; + sol_memcmp_(s1, s2, n as u64, &mut result as *mut i32); + return result; + } + let mut i: isize = 0; + if chunks != 0 { + let s1_64 = s1 as *const _ as *const u64; + let s2_64 = s2 as *const _ as *const u64; + while i < chunks { + let a = *s1_64.offset(i); + let b = *s2_64.offset(i); + if a != b { + break; + } + i += 1; + } + i *= 8; + } + while i < n as isize { + let a = *s1.offset(i); + let b = *s2.offset(i); + if a != b { + return a as i32 - b as i32; + } + i += 1; + } + 0 +} diff --git a/src/x86.rs b/src/x86.rs index fd1f32e3..ad04d210 100644 --- a/src/x86.rs +++ b/src/x86.rs @@ -6,60 +6,28 @@ use core::intrinsics; // calling convention which can't be implemented using a normal Rust function // NOTE These functions are never mangled as they are not tested against compiler-rt -// and mangling ___chkstk would break the `jmp ___chkstk` instruction in __alloca intrinsics! { #[naked] #[cfg(all( - windows, - target_env = "gnu", + any(all(windows, target_env = "gnu"), target_os = "uefi"), not(feature = "no-asm") ))] - pub unsafe extern "C" fn ___chkstk_ms() { - core::arch::asm!( - "push %ecx", - "push %eax", - "cmp $0x1000,%eax", - "lea 12(%esp),%ecx", - "jb 1f", - "2:", - "sub $0x1000,%ecx", - "test %ecx,(%ecx)", - "sub $0x1000,%eax", - "cmp $0x1000,%eax", - "ja 2b", - "1:", - "sub %eax,%ecx", - "test %ecx,(%ecx)", - "pop %eax", - "pop %ecx", - "ret", - options(noreturn, att_syntax) - ); - } - - // FIXME: __alloca should be an alias to __chkstk - #[naked] - #[cfg(all( - windows, - target_env = "gnu", - not(feature = "no-asm") - ))] - pub unsafe extern "C" fn __alloca() { - core::arch::asm!( - "jmp ___chkstk", // Jump to ___chkstk since fallthrough may be unreliable" - options(noreturn, att_syntax) + pub unsafe extern "C" fn __chkstk() { + core::arch::naked_asm!( + "jmp __alloca", // Jump to __alloca since fallthrough may be unreliable" + options(att_syntax) ); } #[naked] #[cfg(all( - windows, - target_env = "gnu", + any(all(windows, target_env = "gnu"), target_os = "uefi"), not(feature = "no-asm") ))] - pub unsafe extern "C" fn ___chkstk() { - core::arch::asm!( + pub unsafe extern "C" fn _alloca() { + // __chkstk and _alloca are the same function + core::arch::naked_asm!( "push %ecx", "cmp $0x1000,%eax", "lea 8(%esp),%ecx", // esp before calling this routine -> ecx @@ -79,7 +47,7 @@ intrinsics! 
{ "push (%eax)", // push return address onto the stack "sub %esp,%eax", // restore the original value in eax "ret", - options(noreturn, att_syntax) + options(att_syntax) ); } } diff --git a/src/x86_64.rs b/src/x86_64.rs index 393eeddd..9c91a455 100644 --- a/src/x86_64.rs +++ b/src/x86_64.rs @@ -6,17 +6,15 @@ use core::intrinsics; // calling convention which can't be implemented using a normal Rust function // NOTE These functions are never mangled as they are not tested against compiler-rt -// and mangling ___chkstk would break the `jmp ___chkstk` instruction in __alloca intrinsics! { #[naked] #[cfg(all( - windows, - target_env = "gnu", + any(all(windows, target_env = "gnu"), target_os = "uefi"), not(feature = "no-asm") ))] pub unsafe extern "C" fn ___chkstk_ms() { - core::arch::asm!( + core::arch::naked_asm!( "push %rcx", "push %rax", "cmp $0x1000,%rax", @@ -34,52 +32,7 @@ intrinsics! { "pop %rax", "pop %rcx", "ret", - options(noreturn, att_syntax) - ); - } - - #[naked] - #[cfg(all( - windows, - target_env = "gnu", - not(feature = "no-asm") - ))] - pub unsafe extern "C" fn __alloca() { - core::arch::asm!( - "mov %rcx,%rax", // x64 _alloca is a normal function with parameter in rcx - "jmp ___chkstk", // Jump to ___chkstk since fallthrough may be unreliable" - options(noreturn, att_syntax) - ); - } - - #[naked] - #[cfg(all( - windows, - target_env = "gnu", - not(feature = "no-asm") - ))] - pub unsafe extern "C" fn ___chkstk() { - core::arch::asm!( - "push %rcx", - "cmp $0x1000,%rax", - "lea 16(%rsp),%rcx", // rsp before calling this routine -> rcx - "jb 1f", - "2:", - "sub $0x1000,%rcx", - "test %rcx,(%rcx)", - "sub $0x1000,%rax", - "cmp $0x1000,%rax", - "ja 2b", - "1:", - "sub %rax,%rcx", - "test %rcx,(%rcx)", - "lea 8(%rsp),%rax", // load pointer to the return address into rax - "mov %rcx,%rsp", // install the new top of stack pointer into rsp - "mov -8(%rax),%rcx", // restore rcx - "push (%rax)", // push return address onto the stack - "sub %rsp,%rax", // restore the original value in rax - "ret", - options(noreturn, att_syntax) + options(att_syntax) ); } } diff --git a/testcrate/Cargo.toml b/testcrate/Cargo.toml index 762d3293..21cec170 100644 --- a/testcrate/Cargo.toml +++ b/testcrate/Cargo.toml @@ -2,7 +2,8 @@ name = "testcrate" version = "0.1.0" authors = ["Alex Crichton "] -edition = "2018" +edition = "2021" +publish = false [lib] test = false @@ -13,12 +14,18 @@ doctest = false # problems with system RNGs on the variety of platforms this crate is tested on. # `xoshiro128**` is used for its quality, size, and speed at generating `u32` shift amounts. rand_xoshiro = "0.6" +# To compare float builtins against +rustc_apfloat = "0.2.1" [dependencies.compiler_builtins] path = ".." 
 default-features = false
 features = ["public-test-deps"]
 
+[dev-dependencies]
+criterion = { version = "0.5.1", default-features = false, features = ["cargo_bench_support"] }
+paste = "1.0.15"
+
 [target.'cfg(all(target_arch = "arm", not(any(target_env = "gnu", target_env = "musl")), target_os = "linux"))'.dev-dependencies]
 test = { git = "https://github.com/japaric/utest" }
 utest-cortex-m-qemu = { default-features = false, git = "https://github.com/japaric/utest" }
@@ -28,5 +35,52 @@ utest-macros = { git = "https://github.com/japaric/utest" }
 default = ["mangled-names"]
 c = ["compiler_builtins/c"]
 no-asm = ["compiler_builtins/no-asm"]
+no-f16-f128 = ["compiler_builtins/no-f16-f128"]
 mem = ["compiler_builtins/mem"]
 mangled-names = ["compiler_builtins/mangled-names"]
+# Skip tests that rely on f128 symbols being available on the system
+no-sys-f128 = ["no-sys-f128-int-convert", "no-sys-f16-f128-convert"]
+# Some platforms have some f128 functions but everything except integer conversions
+no-sys-f128-int-convert = []
+no-sys-f16-f128-convert = []
+# Skip tests that rely on f16 symbols being available on the system
+no-sys-f16 = []
+
+# Enable report generation without bringing in more dependencies by default
+benchmarking-reports = ["criterion/plotters", "criterion/html_reports"]
+
+[[bench]]
+name = "float_add"
+harness = false
+
+[[bench]]
+name = "float_sub"
+harness = false
+
+[[bench]]
+name = "float_mul"
+harness = false
+
+[[bench]]
+name = "float_div"
+harness = false
+
+[[bench]]
+name = "float_cmp"
+harness = false
+
+[[bench]]
+name = "float_conv"
+harness = false
+
+[[bench]]
+name = "float_extend"
+harness = false
+
+[[bench]]
+name = "float_trunc"
+harness = false
+
+[[bench]]
+name = "float_pow"
+harness = false
diff --git a/testcrate/benches/float_add.rs b/testcrate/benches/float_add.rs
new file mode 100644
index 00000000..3311e7b5
--- /dev/null
+++ b/testcrate/benches/float_add.rs
@@ -0,0 +1,93 @@
+#![cfg_attr(f128_enabled, feature(f128))]
+
+use compiler_builtins::float::add;
+use criterion::{criterion_main, Criterion};
+use testcrate::float_bench;
+
+float_bench! {
+    name: add_f32,
+    sig: (a: f32, b: f32) -> f32,
+    crate_fn: add::__addsf3,
+    sys_fn: __addsf3,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            asm!(
+                "addss {a}, {b}",
+                a = inout(xmm_reg) a,
+                b = in(xmm_reg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            asm!(
+                "fadd {a:s}, {a:s}, {b:s}",
+                a = inout(vreg) a,
+                b = in(vreg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+    ],
+}
+
+float_bench! {
+    name: add_f64,
+    sig: (a: f64, b: f64) -> f64,
+    crate_fn: add::__adddf3,
+    sys_fn: __adddf3,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            asm!(
+                "addsd {a}, {b}",
+                a = inout(xmm_reg) a,
+                b = in(xmm_reg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            asm!(
+                "fadd {a:d}, {a:d}, {b:d}",
+                a = inout(vreg) a,
+                b = in(vreg) b,
+                options(nomem, nostack, pure)
+            );
+
+            a
+        };
+    ],
+}
+
+#[cfg(f128_enabled)]
+float_bench! {
+    name: add_f128,
+    sig: (a: f128, b: f128) -> f128,
+    crate_fn: add::__addtf3,
+    crate_fn_ppc: add::__addkf3,
+    sys_fn: __addtf3,
+    sys_fn_ppc: __addkf3,
+    sys_available: not(feature = "no-sys-f128"),
+    asm: []
+}
+
+pub fn float_add() {
+    let mut criterion = Criterion::default().configure_from_args();
+
+    add_f32(&mut criterion);
+    add_f64(&mut criterion);
+
+    #[cfg(f128_enabled)]
+    {
+        add_f128(&mut criterion);
+    }
+}
+
+criterion_main!(float_add);
diff --git a/testcrate/benches/float_cmp.rs b/testcrate/benches/float_cmp.rs
new file mode 100644
index 00000000..400c09b4
--- /dev/null
+++ b/testcrate/benches/float_cmp.rs
@@ -0,0 +1,208 @@
+#![cfg_attr(f128_enabled, feature(f128))]
+
+use criterion::{criterion_main, Criterion};
+use testcrate::float_bench;
+
+use compiler_builtins::float::cmp;
+
+/// `gt` symbols are allowed to return differing results, they just get compared
+/// to 0.
+fn gt_res_eq(a: i32, b: i32) -> bool {
+    let a_lt_0 = a <= 0;
+    let b_lt_0 = b <= 0;
+    (a_lt_0 && b_lt_0) || (!a_lt_0 && !b_lt_0)
+}
+
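The `__gt*2` builtins only promise sign agreement with respect to zero, which is what `gt_res_eq` encodes. A tiny standalone check (duplicating the helper above so it runs on its own):

```rust
fn gt_res_eq(a: i32, b: i32) -> bool {
    let a_lt_0 = a <= 0;
    let b_lt_0 = b <= 0;
    (a_lt_0 && b_lt_0) || (!a_lt_0 && !b_lt_0)
}

fn main() {
    assert!(gt_res_eq(1, 42));  // both mean "greater": equivalent results
    assert!(gt_res_eq(-3, 0));  // both mean "not greater": equivalent results
    assert!(!gt_res_eq(5, -5)); // the two results disagree on the predicate
}
```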
+float_bench! {
+    name: cmp_f32_gt,
+    sig: (a: f32, b: f32) -> i32,
+    crate_fn: cmp::__gtsf2,
+    sys_fn: __gtsf2,
+    sys_available: all(),
+    output_eq: gt_res_eq,
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: i32;
+            asm!(
+                "xor {ret:e}, {ret:e}",
+                "ucomiss {a}, {b}",
+                "seta {ret:l}",
+                a = in(xmm_reg) a,
+                b = in(xmm_reg) b,
+                ret = out(reg) ret,
+                options(nomem, nostack, pure)
+            );
+
+            ret
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: i32;
+            asm!(
+                "fcmp {a:s}, {b:s}",
+                "cset {ret:w}, gt",
+                a = in(vreg) a,
+                b = in(vreg) b,
+                ret = out(reg) ret,
+                options(nomem,nostack),
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: cmp_f32_unord,
+    sig: (a: f32, b: f32) -> i32,
+    crate_fn: cmp::__unordsf2,
+    sys_fn: __unordsf2,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: i32;
+            asm!(
+                "xor {ret:e}, {ret:e}",
+                "ucomiss {a}, {b}",
+                "setp {ret:l}",
+                a = in(xmm_reg) a,
+                b = in(xmm_reg) b,
+                ret = out(reg) ret,
+                options(nomem, nostack, pure)
+            );
+
+            ret
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: i32;
+            asm!(
+                "fcmp {a:s}, {b:s}",
+                "cset {ret:w}, vs",
+                a = in(vreg) a,
+                b = in(vreg) b,
+                ret = out(reg) ret,
+                options(nomem, nostack, pure)
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: cmp_f64_gt,
+    sig: (a: f64, b: f64) -> i32,
+    crate_fn: cmp::__gtdf2,
+    sys_fn: __gtdf2,
+    sys_available: all(),
+    output_eq: gt_res_eq,
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: i32;
+            asm!(
+                "xor {ret:e}, {ret:e}",
+                "ucomisd {a}, {b}",
+                "seta {ret:l}",
+                a = in(xmm_reg) a,
+                b = in(xmm_reg) b,
+                ret = out(reg) ret,
+                options(nomem, nostack, pure)
+            );
+
+            ret
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: i32;
+            asm!(
+                "fcmp {a:d}, {b:d}",
+                "cset {ret:w}, gt",
+                a = in(vreg) a,
+                b = in(vreg) b,
+                ret = out(reg) ret,
+                options(nomem, nostack, pure)
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench! {
+    name: cmp_f64_unord,
+    sig: (a: f64, b: f64) -> i32,
+    crate_fn: cmp::__unorddf2,
+    sys_fn: __unorddf2,
+    sys_available: all(),
+    asm: [
+        #[cfg(target_arch = "x86_64")] {
+            let ret: i32;
+            asm!(
+                "xor {ret:e}, {ret:e}",
+                "ucomisd {a}, {b}",
+                "setp {ret:l}",
+                a = in(xmm_reg) a,
+                b = in(xmm_reg) b,
+                ret = out(reg) ret,
+                options(nomem, nostack, pure)
+            );
+
+            ret
+        };
+
+        #[cfg(target_arch = "aarch64")] {
+            let ret: i32;
+            asm!(
+                "fcmp {a:d}, {b:d}",
+                "cset {ret:w}, vs",
+                a = in(vreg) a,
+                b = in(vreg) b,
+                ret = out(reg) ret,
+                options(nomem, nostack, pure)
+            );
+
+            ret
+        };
+    ],
+}
+
+float_bench!
{ + name: cmp_f128_gt, + sig: (a: f128, b: f128) -> i32, + crate_fn: cmp::__gttf2, + crate_fn_ppc: cmp::__gtkf2, + sys_fn: __gttf2, + sys_fn_ppc: __gtkf2, + sys_available: not(feature = "no-sys-f128"), + output_eq: gt_res_eq, + asm: [] +} + +float_bench! { + name: cmp_f128_unord, + sig: (a: f128, b: f128) -> i32, + crate_fn: cmp::__unordtf2, + crate_fn_ppc: cmp::__unordkf2, + sys_fn: __unordtf2, + sys_fn_ppc: __unordkf2, + sys_available: not(feature = "no-sys-f128"), + asm: [] +} + +pub fn float_cmp() { + let mut criterion = Criterion::default().configure_from_args(); + + cmp_f32_gt(&mut criterion); + cmp_f32_unord(&mut criterion); + cmp_f64_gt(&mut criterion); + cmp_f64_unord(&mut criterion); + + #[cfg(f128_enabled)] + { + cmp_f128_gt(&mut criterion); + cmp_f128_unord(&mut criterion); + } +} + +criterion_main!(float_cmp); diff --git a/testcrate/benches/float_conv.rs b/testcrate/benches/float_conv.rs new file mode 100644 index 00000000..de2043b0 --- /dev/null +++ b/testcrate/benches/float_conv.rs @@ -0,0 +1,546 @@ +#![allow(improper_ctypes)] + +use compiler_builtins::float::conv; +use criterion::{criterion_group, criterion_main, Criterion}; +use testcrate::float_bench; + +/* unsigned int -> float */ + +float_bench! { + name: conv_u32_f32, + sig: (a: u32) -> f32, + crate_fn: conv::__floatunsisf, + sys_fn: __floatunsisf, + sys_available: all(), + asm: [ + #[cfg(target_arch = "x86_64")] { + let ret: f32; + asm!( + "mov {tmp:e}, {a:e}", + "cvtsi2ss {ret}, {tmp}", + a = in(reg) a, + tmp = out(reg) _, + ret = lateout(xmm_reg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + + #[cfg(target_arch = "aarch64")] { + let ret: f32; + asm!( + "ucvtf {ret:s}, {a:w}", + a = in(reg) a, + ret = lateout(vreg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + ], +} + +float_bench! { + name: conv_u32_f64, + sig: (a: u32) -> f64, + crate_fn: conv::__floatunsidf, + sys_fn: __floatunsidf, + sys_available: all(), + asm: [ + #[cfg(target_arch = "x86_64")] { + let ret: f64; + asm!( + "mov {tmp:e}, {a:e}", + "cvtsi2sd {ret}, {tmp}", + a = in(reg) a, + tmp = out(reg) _, + ret = lateout(xmm_reg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + + #[cfg(target_arch = "aarch64")] { + let ret: f64; + asm!( + "ucvtf {ret:d}, {a:w}", + a = in(reg) a, + ret = lateout(vreg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + ], +} + +float_bench! { + name: conv_u64_f32, + sig: (a: u64) -> f32, + crate_fn: conv::__floatundisf, + sys_fn: __floatundisf, + sys_available: all(), + asm: [ + #[cfg(target_arch = "aarch64")] { + let ret: f32; + asm!( + "ucvtf {ret:s}, {a:x}", + a = in(reg) a, + ret = lateout(vreg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + ], +} + +float_bench! { + name: conv_u64_f64, + sig: (a: u64) -> f64, + crate_fn: conv::__floatundidf, + sys_fn: __floatundidf, + sys_available: all(), + asm: [ + #[cfg(target_arch = "aarch64")] { + let ret: f64; + asm!( + "ucvtf {ret:d}, {a:x}", + a = in(reg) a, + ret = lateout(vreg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + ], +} + +float_bench! { + name: conv_u128_f32, + sig: (a: u128) -> f32, + crate_fn: conv::__floatuntisf, + sys_fn: __floatuntisf, + sys_available: all(), + asm: [] +} + +float_bench! { + name: conv_u128_f64, + sig: (a: u128) -> f64, + crate_fn: conv::__floatuntidf, + sys_fn: __floatuntidf, + sys_available: all(), + asm: [] +} + +/* signed int -> float */ + +float_bench! 
{ + name: conv_i32_f32, + sig: (a: i32) -> f32, + crate_fn: conv::__floatsisf, + sys_fn: __floatsisf, + sys_available: all(), + asm: [ + #[cfg(target_arch = "x86_64")] { + let ret: f32; + asm!( + "cvtsi2ss {ret}, {a:e}", + a = in(reg) a, + ret = lateout(xmm_reg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + + #[cfg(target_arch = "aarch64")] { + let ret: f32; + asm!( + "scvtf {ret:s}, {a:w}", + a = in(reg) a, + ret = lateout(vreg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + ], +} + +float_bench! { + name: conv_i32_f64, + sig: (a: i32) -> f64, + crate_fn: conv::__floatsidf, + sys_fn: __floatsidf, + sys_available: all(), + asm: [ + #[cfg(target_arch = "x86_64")] { + let ret: f64; + asm!( + "cvtsi2sd {ret}, {a:e}", + a = in(reg) a, + ret = lateout(xmm_reg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + + + #[cfg(target_arch = "aarch64")] { + let ret: f64; + asm!( + "scvtf {ret:d}, {a:w}", + a = in(reg) a, + ret = lateout(vreg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + ], +} + +float_bench! { + name: conv_i64_f32, + sig: (a: i64) -> f32, + crate_fn: conv::__floatdisf, + sys_fn: __floatdisf, + sys_available: all(), + asm: [ + #[cfg(target_arch = "x86_64")] { + let ret: f32; + asm!( + "cvtsi2ss {ret}, {a:r}", + a = in(reg) a, + ret = lateout(xmm_reg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + + #[cfg(target_arch = "aarch64")] { + let ret: f32; + asm!( + "scvtf {ret:s}, {a:x}", + a = in(reg) a, + ret = lateout(vreg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + ], +} + +float_bench! { + name: conv_i64_f64, + sig: (a: i64) -> f64, + crate_fn: conv::__floatdidf, + sys_fn: __floatdidf, + sys_available: all(), + asm: [ + #[cfg(target_arch = "x86_64")] { + let ret: f64; + asm!( + "cvtsi2sd {ret}, {a:r}", + a = in(reg) a, + ret = lateout(xmm_reg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + + + #[cfg(target_arch = "aarch64")] { + let ret: f64; + asm!( + "scvtf {ret:d}, {a:x}", + a = in(reg) a, + ret = lateout(vreg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + ], +} + +float_bench! { + name: conv_i128_f32, + sig: (a: i128) -> f32, + crate_fn: conv::__floattisf, + sys_fn: __floattisf, + sys_available: all(), + asm: [] +} + +float_bench! { + name: conv_i128_f64, + sig: (a: i128) -> f64, + crate_fn: conv::__floattidf, + sys_fn: __floattidf, + sys_available: all(), + asm: [] +} + +/* float -> unsigned int */ + +#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))] +float_bench! { + name: conv_f32_u32, + sig: (a: f32) -> u32, + crate_fn: conv::__fixunssfsi, + sys_fn: __fixunssfsi, + sys_available: all(), + asm: [ + #[cfg(target_arch = "aarch64")] { + let ret: u32; + asm!( + "fcvtzu {ret:w}, {a:s}", + a = in(vreg) a, + ret = lateout(reg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + ], +} + +#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))] +float_bench! { + name: conv_f32_u64, + sig: (a: f32) -> u64, + crate_fn: conv::__fixunssfdi, + sys_fn: __fixunssfdi, + sys_available: all(), + asm: [ + #[cfg(target_arch = "aarch64")] { + let ret: u64; + asm!( + "fcvtzu {ret:x}, {a:s}", + a = in(vreg) a, + ret = lateout(reg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + ], +} + +#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))] +float_bench! { + name: conv_f32_u128, + sig: (a: f32) -> u128, + crate_fn: conv::__fixunssfti, + sys_fn: __fixunssfti, + sys_available: all(), + asm: [] +} + +float_bench! 
{ + name: conv_f64_u32, + sig: (a: f64) -> u32, + crate_fn: conv::__fixunsdfsi, + sys_fn: __fixunsdfsi, + sys_available: all(), + asm: [ + #[cfg(target_arch = "aarch64")] { + let ret: u32; + asm!( + "fcvtzu {ret:w}, {a:d}", + a = in(vreg) a, + ret = lateout(reg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + ], +} + +float_bench! { + name: conv_f64_u64, + sig: (a: f64) -> u64, + crate_fn: conv::__fixunsdfdi, + sys_fn: __fixunsdfdi, + sys_available: all(), + asm: [ + #[cfg(target_arch = "aarch64")] { + let ret: u64; + asm!( + "fcvtzu {ret:x}, {a:d}", + a = in(vreg) a, + ret = lateout(reg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + ], +} + +float_bench! { + name: conv_f64_u128, + sig: (a: f64) -> u128, + crate_fn: conv::__fixunsdfti, + sys_fn: __fixunsdfti, + sys_available: all(), + asm: [] +} + +/* float -> signed int */ + +#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))] +float_bench! { + name: conv_f32_i32, + sig: (a: f32) -> i32, + crate_fn: conv::__fixsfsi, + sys_fn: __fixsfsi, + sys_available: all(), + asm: [ + #[cfg(target_arch = "aarch64")] { + let ret: i32; + asm!( + "fcvtzs {ret:w}, {a:s}", + a = in(vreg) a, + ret = lateout(reg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + ], +} + +#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))] +float_bench! { + name: conv_f32_i64, + sig: (a: f32) -> i64, + crate_fn: conv::__fixsfdi, + sys_fn: __fixsfdi, + sys_available: all(), + asm: [ + #[cfg(target_arch = "aarch64")] { + let ret: i64; + asm!( + "fcvtzs {ret:x}, {a:s}", + a = in(vreg) a, + ret = lateout(reg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + ], +} + +#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))] +float_bench! { + name: conv_f32_i128, + sig: (a: f32) -> i128, + crate_fn: conv::__fixsfti, + sys_fn: __fixsfti, + sys_available: all(), + asm: [] +} + +float_bench! { + name: conv_f64_i32, + sig: (a: f64) -> i32, + crate_fn: conv::__fixdfsi, + sys_fn: __fixdfsi, + sys_available: all(), + asm: [ + #[cfg(target_arch = "aarch64")] { + let ret: i32; + asm!( + "fcvtzs {ret:w}, {a:d}", + a = in(vreg) a, + ret = lateout(reg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + ], +} + +float_bench! { + name: conv_f64_i64, + sig: (a: f64) -> i64, + crate_fn: conv::__fixdfdi, + sys_fn: __fixdfdi, + sys_available: all(), + asm: [ + #[cfg(target_arch = "aarch64")] { + let ret: i64; + asm!( + "fcvtzs {ret:x}, {a:d}", + a = in(vreg) a, + ret = lateout(reg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + ], +} + +float_bench! 
{ + name: conv_f64_i128, + sig: (a: f64) -> i128, + crate_fn: conv::__fixdfti, + sys_fn: __fixdfti, + sys_available: all(), + asm: [] +} + +criterion_group!( + float_conv, + conv_u32_f32, + conv_u32_f64, + conv_u64_f32, + conv_u64_f64, + conv_u128_f32, + conv_u128_f64, + conv_i32_f32, + conv_i32_f64, + conv_i64_f32, + conv_i64_f64, + conv_i128_f32, + conv_i128_f64, + conv_f64_u32, + conv_f64_u64, + conv_f64_u128, + conv_f64_i32, + conv_f64_i64, + conv_f64_i128, +); + +// FIXME: ppc64le has a sporadic overflow panic in the crate functions +// +#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))] +criterion_group!( + float_conv_not_ppc64le, + conv_f32_u32, + conv_f32_u64, + conv_f32_u128, + conv_f32_i32, + conv_f32_i64, + conv_f32_i128, +); + +#[cfg(all(target_arch = "powerpc64", target_endian = "little"))] +criterion_main!(float_conv); + +#[cfg(not(all(target_arch = "powerpc64", target_endian = "little")))] +criterion_main!(float_conv, float_conv_not_ppc64le); diff --git a/testcrate/benches/float_div.rs b/testcrate/benches/float_div.rs new file mode 100644 index 00000000..6a039a82 --- /dev/null +++ b/testcrate/benches/float_div.rs @@ -0,0 +1,93 @@ +#![cfg_attr(f128_enabled, feature(f128))] + +use compiler_builtins::float::div; +use criterion::{criterion_main, Criterion}; +use testcrate::float_bench; + +float_bench! { + name: div_f32, + sig: (a: f32, b: f32) -> f32, + crate_fn: div::__divsf3, + sys_fn: __divsf3, + sys_available: all(), + asm: [ + #[cfg(target_arch = "x86_64")] { + asm!( + "divss {a}, {b}", + a = inout(xmm_reg) a, + b = in(xmm_reg) b, + options(nomem, nostack, pure) + ); + + a + }; + + #[cfg(target_arch = "aarch64")] { + asm!( + "fdiv {a:s}, {a:s}, {b:s}", + a = inout(vreg) a, + b = in(vreg) b, + options(nomem, nostack, pure) + ); + + a + }; + ], +} + +float_bench! { + name: div_f64, + sig: (a: f64, b: f64) -> f64, + crate_fn: div::__divdf3, + sys_fn: __divdf3, + sys_available: all(), + asm: [ + #[cfg(target_arch = "x86_64")] { + asm!( + "divsd {a}, {b}", + a = inout(xmm_reg) a, + b = in(xmm_reg) b, + options(nomem, nostack, pure) + ); + + a + }; + + #[cfg(target_arch = "aarch64")] { + asm!( + "fdiv {a:d}, {a:d}, {b:d}", + a = inout(vreg) a, + b = in(vreg) b, + options(nomem, nostack, pure) + ); + + a + }; + ], +} + +#[cfg(f128_enabled)] +float_bench! { + name: div_f128, + sig: (a: f128, b: f128) -> f128, + crate_fn: div::__divtf3, + crate_fn_ppc: div::__divkf3, + sys_fn: __divtf3, + sys_fn_ppc: __divkf3, + sys_available: not(feature = "no-sys-f128"), + asm: [] +} + +pub fn float_div() { + let mut criterion = Criterion::default().configure_from_args(); + + div_f32(&mut criterion); + div_f64(&mut criterion); + + #[cfg(f128_enabled)] + { + div_f128(&mut criterion); + } +} + +criterion_main!(float_div); diff --git a/testcrate/benches/float_extend.rs b/testcrate/benches/float_extend.rs new file mode 100644 index 00000000..a0cdaf48 --- /dev/null +++ b/testcrate/benches/float_extend.rs @@ -0,0 +1,110 @@ +#![allow(unused_variables)] // "unused" f16 registers +#![cfg_attr(f128_enabled, feature(f128))] +#![cfg_attr(f16_enabled, feature(f16))] + +use compiler_builtins::float::extend; +use criterion::{criterion_main, Criterion}; +use testcrate::float_bench; + +#[cfg(f16_enabled)] +float_bench! 
{ + name: extend_f16_f32, + sig: (a: f16) -> f32, + crate_fn: extend::__extendhfsf2, + sys_fn: __extendhfsf2, + sys_available: not(feature = "no-sys-f16"), + asm: [ + #[cfg(target_arch = "aarch64")] { + let ret: f32; + asm!( + "fcvt {ret:s}, {a:h}", + a = in(vreg) a, + ret = lateout(vreg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + ], +} + +#[cfg(all(f16_enabled, f128_enabled))] +float_bench! { + name: extend_f16_f128, + sig: (a: f16) -> f128, + crate_fn: extend::__extendhftf2, + crate_fn_ppc: extend::__extendhfkf2, + sys_fn: __extendhftf2, + sys_fn_ppc: __extendhfkf2, + sys_available: not(feature = "no-sys-f16-f128-convert"), + asm: [], +} + +float_bench! { + name: extend_f32_f64, + sig: (a: f32) -> f64, + crate_fn: extend::__extendsfdf2, + sys_fn: __extendsfdf2, + sys_available: all(), + asm: [ + #[cfg(target_arch = "aarch64")] { + let ret: f64; + asm!( + "fcvt {ret:d}, {a:s}", + a = in(vreg) a, + ret = lateout(vreg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + ], +} + +#[cfg(f128_enabled)] +float_bench! { + name: extend_f32_f128, + sig: (a: f32) -> f128, + crate_fn: extend::__extendsftf2, + crate_fn_ppc: extend::__extendsfkf2, + sys_fn: __extendsftf2, + sys_fn_ppc: __extendsfkf2, + sys_available: not(feature = "no-sys-f128"), + asm: [], +} + +#[cfg(f128_enabled)] +float_bench! { + name: extend_f64_f128, + sig: (a: f64) -> f128, + crate_fn: extend::__extenddftf2, + crate_fn_ppc: extend::__extenddfkf2, + sys_fn: __extenddftf2, + sys_fn_ppc: __extenddfkf2, + sys_available: not(feature = "no-sys-f128"), + asm: [], +} + +pub fn float_extend() { + let mut criterion = Criterion::default().configure_from_args(); + + // FIXME(#655): `f16` tests disabled until we can bootstrap symbols + #[cfg(f16_enabled)] + #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))] + { + extend_f16_f32(&mut criterion); + + #[cfg(f128_enabled)] + extend_f16_f128(&mut criterion); + } + + extend_f32_f64(&mut criterion); + + #[cfg(f128_enabled)] + { + extend_f32_f128(&mut criterion); + extend_f64_f128(&mut criterion); + } +} + +criterion_main!(float_extend); diff --git a/testcrate/benches/float_mul.rs b/testcrate/benches/float_mul.rs new file mode 100644 index 00000000..6e30b786 --- /dev/null +++ b/testcrate/benches/float_mul.rs @@ -0,0 +1,93 @@ +#![cfg_attr(f128_enabled, feature(f128))] + +use compiler_builtins::float::mul; +use criterion::{criterion_main, Criterion}; +use testcrate::float_bench; + +float_bench! { + name: mul_f32, + sig: (a: f32, b: f32) -> f32, + crate_fn: mul::__mulsf3, + sys_fn: __mulsf3, + sys_available: all(), + asm: [ + #[cfg(target_arch = "x86_64")] { + asm!( + "mulss {a}, {b}", + a = inout(xmm_reg) a, + b = in(xmm_reg) b, + options(nomem, nostack, pure) + ); + + a + }; + + #[cfg(target_arch = "aarch64")] { + asm!( + "fmul {a:s}, {a:s}, {b:s}", + a = inout(vreg) a, + b = in(vreg) b, + options(nomem, nostack, pure) + ); + + a + }; + ], +} + +float_bench! { + name: mul_f64, + sig: (a: f64, b: f64) -> f64, + crate_fn: mul::__muldf3, + sys_fn: __muldf3, + sys_available: all(), + asm: [ + #[cfg(target_arch = "x86_64")] { + asm!( + "mulsd {a}, {b}", + a = inout(xmm_reg) a, + b = in(xmm_reg) b, + options(nomem, nostack, pure) + ); + + a + }; + + #[cfg(target_arch = "aarch64")] { + asm!( + "fmul {a:d}, {a:d}, {b:d}", + a = inout(vreg) a, + b = in(vreg) b, + options(nomem, nostack, pure) + ); + + a + }; + ], +} + +#[cfg(f128_enabled)] +float_bench! 
{ + name: mul_f128, + sig: (a: f128, b: f128) -> f128, + crate_fn: mul::__multf3, + crate_fn_ppc: mul::__mulkf3, + sys_fn: __multf3, + sys_fn_ppc: __mulkf3, + sys_available: not(feature = "no-sys-f128"), + asm: [] +} + +pub fn float_mul() { + let mut criterion = Criterion::default().configure_from_args(); + + mul_f32(&mut criterion); + mul_f64(&mut criterion); + + #[cfg(f128_enabled)] + { + mul_f128(&mut criterion); + } +} + +criterion_main!(float_mul); diff --git a/testcrate/benches/float_pow.rs b/testcrate/benches/float_pow.rs new file mode 100644 index 00000000..46da3f25 --- /dev/null +++ b/testcrate/benches/float_pow.rs @@ -0,0 +1,49 @@ +#![cfg_attr(f128_enabled, feature(f128))] + +use compiler_builtins::float::pow; +use criterion::{criterion_main, Criterion}; +use testcrate::float_bench; + +float_bench! { + name: powi_f32, + sig: (a: f32, b: i32) -> f32, + crate_fn: pow::__powisf2, + sys_fn: __powisf2, + sys_available: all(), + asm: [], +} + +float_bench! { + name: powi_f64, + sig: (a: f64, b: i32) -> f64, + crate_fn: pow::__powidf2, + sys_fn: __powidf2, + sys_available: all(), + asm: [], +} + +// FIXME(f16_f128): can be changed to only `f128_enabled` once `__multf3` and `__divtf3` are +// distributed by nightly. +#[cfg(all(f128_enabled, not(feature = "no-sys-f128")))] +float_bench! { + name: powi_f128, + sig: (a: f128, b: i32) -> f128, + crate_fn: pow::__powitf2, + crate_fn_ppc: pow::__powikf2, + sys_fn: __powitf2, + sys_fn_ppc: __powikf2, + sys_available: not(feature = "no-sys-f128"), + asm: [] +} + +pub fn float_pow() { + let mut criterion = Criterion::default().configure_from_args(); + + powi_f32(&mut criterion); + powi_f64(&mut criterion); + + #[cfg(all(f128_enabled, not(feature = "no-sys-f128")))] + powi_f128(&mut criterion); +} + +criterion_main!(float_pow); diff --git a/testcrate/benches/float_sub.rs b/testcrate/benches/float_sub.rs new file mode 100644 index 00000000..cdb678ee --- /dev/null +++ b/testcrate/benches/float_sub.rs @@ -0,0 +1,93 @@ +#![cfg_attr(f128_enabled, feature(f128))] + +use compiler_builtins::float::sub; +use criterion::{criterion_main, Criterion}; +use testcrate::float_bench; + +float_bench! { + name: sub_f32, + sig: (a: f32, b: f32) -> f32, + crate_fn: sub::__subsf3, + sys_fn: __subsf3, + sys_available: all(), + asm: [ + #[cfg(target_arch = "x86_64")] { + asm!( + "subss {a}, {b}", + a = inout(xmm_reg) a, + b = in(xmm_reg) b, + options(nomem, nostack, pure) + ); + + a + }; + + #[cfg(target_arch = "aarch64")] { + asm!( + "fsub {a:s}, {a:s}, {b:s}", + a = inout(vreg) a, + b = in(vreg) b, + options(nomem, nostack, pure) + ); + + a + }; + ], +} + +float_bench! { + name: sub_f64, + sig: (a: f64, b: f64) -> f64, + crate_fn: sub::__subdf3, + sys_fn: __subdf3, + sys_available: all(), + asm: [ + #[cfg(target_arch = "x86_64")] { + asm!( + "subsd {a}, {b}", + a = inout(xmm_reg) a, + b = in(xmm_reg) b, + options(nomem, nostack, pure) + ); + + a + }; + + #[cfg(target_arch = "aarch64")] { + asm!( + "fsub {a:d}, {a:d}, {b:d}", + a = inout(vreg) a, + b = in(vreg) b, + options(nomem, nostack, pure) + ); + + a + }; + ], +} + +#[cfg(f128_enabled)] +float_bench! 
{ + name: sub_f128, + sig: (a: f128, b: f128) -> f128, + crate_fn: sub::__subtf3, + crate_fn_ppc: sub::__subkf3, + sys_fn: __subtf3, + sys_fn_ppc: __subkf3, + sys_available: not(feature = "no-sys-f128"), + asm: [] +} + +pub fn float_sub() { + let mut criterion = Criterion::default().configure_from_args(); + + sub_f32(&mut criterion); + sub_f64(&mut criterion); + + #[cfg(f128_enabled)] + { + sub_f128(&mut criterion); + } +} + +criterion_main!(float_sub); diff --git a/testcrate/benches/float_trunc.rs b/testcrate/benches/float_trunc.rs new file mode 100644 index 00000000..de9b5bf8 --- /dev/null +++ b/testcrate/benches/float_trunc.rs @@ -0,0 +1,146 @@ +#![cfg_attr(f128_enabled, feature(f128))] +#![cfg_attr(f16_enabled, feature(f16))] + +use compiler_builtins::float::trunc; +use criterion::{criterion_main, Criterion}; +use testcrate::float_bench; + +#[cfg(f16_enabled)] +float_bench! { + name: trunc_f32_f16, + sig: (a: f32) -> f16, + crate_fn: trunc::__truncsfhf2, + sys_fn: __truncsfhf2, + sys_available: not(feature = "no-sys-f16"), + asm: [ + #[cfg(target_arch = "aarch64")] { + let ret: f16; + asm!( + "fcvt {ret:h}, {a:s}", + a = in(vreg) a, + ret = lateout(vreg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + ], +} + +#[cfg(f16_enabled)] +float_bench! { + name: trunc_f64_f16, + sig: (a: f64) -> f16, + crate_fn: trunc::__truncdfhf2, + sys_fn: __truncdfhf2, + sys_available: not(feature = "no-sys-f16"), + asm: [ + #[cfg(target_arch = "aarch64")] { + let ret: f16; + asm!( + "fcvt {ret:h}, {a:d}", + a = in(vreg) a, + ret = lateout(vreg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + ], +} + +float_bench! { + name: trunc_f64_f32, + sig: (a: f64) -> f32, + crate_fn: trunc::__truncdfsf2, + sys_fn: __truncdfsf2, + sys_available: all(), + asm: [ + #[cfg(target_arch = "x86_64")] { + let ret: f32; + asm!( + "cvtsd2ss {ret}, {a}", + a = in(xmm_reg) a, + ret = lateout(xmm_reg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + + #[cfg(target_arch = "aarch64")] { + let ret: f32; + asm!( + "fcvt {ret:s}, {a:d}", + a = in(vreg) a, + ret = lateout(vreg) ret, + options(nomem, nostack, pure), + ); + + ret + }; + ], +} + +#[cfg(all(f16_enabled, f128_enabled))] +float_bench! { + name: trunc_f128_f16, + sig: (a: f128) -> f16, + crate_fn: trunc::__trunctfhf2, + crate_fn_ppc: trunc::__trunckfhf2, + sys_fn: __trunctfhf2, + sys_fn_ppc: __trunckfhf2, + sys_available: not(feature = "no-sys-f16-f128-convert"), + asm: [], +} + +#[cfg(f128_enabled)] +float_bench! { + name: trunc_f128_f32, + sig: (a: f128) -> f32, + crate_fn: trunc::__trunctfsf2, + crate_fn_ppc: trunc::__trunckfsf2, + sys_fn: __trunctfsf2, + sys_fn_ppc: __trunckfsf2, + sys_available: not(feature = "no-sys-f128"), + asm: [], +} + +#[cfg(f128_enabled)] +float_bench! 
{
+    name: trunc_f128_f64,
+    sig: (a: f128) -> f64,
+    crate_fn: trunc::__trunctfdf2,
+    crate_fn_ppc: trunc::__trunckfdf2,
+    sys_fn: __trunctfdf2,
+    sys_fn_ppc: __trunckfdf2,
+    sys_available: not(feature = "no-sys-f128"),
+    asm: [],
+}
+
+pub fn float_trunc() {
+    let mut criterion = Criterion::default().configure_from_args();
+
+    // FIXME(#655): `f16` tests disabled until we can bootstrap symbols
+    #[cfg(f16_enabled)]
+    #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+    {
+        trunc_f32_f16(&mut criterion);
+        trunc_f64_f16(&mut criterion);
+    }
+
+    trunc_f64_f32(&mut criterion);
+
+    #[cfg(f128_enabled)]
+    {
+        // FIXME(#655): `f16` tests disabled until we can bootstrap symbols
+        #[cfg(f16_enabled)]
+        #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+        trunc_f128_f16(&mut criterion);
+
+        trunc_f128_f32(&mut criterion);
+        trunc_f128_f64(&mut criterion);
+    }
+}
+
+criterion_main!(float_trunc);
diff --git a/testcrate/build.rs b/testcrate/build.rs
new file mode 100644
index 00000000..6205c7ac
--- /dev/null
+++ b/testcrate/build.rs
@@ -0,0 +1,90 @@
+use std::collections::HashSet;
+
+/// Features to enable
+#[derive(Debug, PartialEq, Eq, Hash)]
+enum Feature {
+    NoSysF128,
+    NoSysF128IntConvert,
+    NoSysF16,
+    NoSysF16F128Convert,
+}
+
+mod builtins_configure {
+    include!("../configure.rs");
+}
+
+fn main() {
+    println!("cargo::rerun-if-changed=../configure.rs");
+
+    let target = builtins_configure::Target::from_env();
+    let mut features = HashSet::new();
+
+    // These platforms do not have f128 symbols available in their system libraries, so
+    // skip related tests.
+    if target.arch == "arm"
+        || target.vendor == "apple"
+        || target.env == "msvc"
+        // GCC and LLVM disagree on the ABI of `f16` and `f128` with MinGW. See
+        // .
+        || (target.os == "windows" && target.env == "gnu")
+        // FIXME(llvm): There is an ABI incompatibility between GCC and Clang on 32-bit x86.
+        // See .
+        || target.arch == "x86"
+        // 32-bit PowerPC and 64-bit LE get code generated that Qemu cannot handle. See
+        // .
+        || target.arch == "powerpc"
+        || target.arch == "powerpc64le"
+        // FIXME: We get different results from the builtin functions. See
+        // .
+        || target.arch == "powerpc64"
+    {
+        features.insert(Feature::NoSysF128);
+        features.insert(Feature::NoSysF128IntConvert);
+        features.insert(Feature::NoSysF16F128Convert);
+    }
+
+    if target.arch == "x86" {
+        // 32-bit x86 does not have `__fixunstfti`/`__fixtfti` but does have everything else
+        features.insert(Feature::NoSysF128IntConvert);
+        // FIXME: 32-bit x86 has a bug in `f128 -> f16` system libraries
+        features.insert(Feature::NoSysF16F128Convert);
+    }
+
+    // These platforms do not have f16 symbols available in their system libraries, so
+    // skip related tests. Most of these are missing `f16 <-> f32` conversion routines.
+    if (target.arch == "aarch64" && target.os == "linux")
+        || target.arch.starts_with("arm")
+        || target.arch == "powerpc"
+        || target.arch == "powerpc64"
+        || target.arch == "powerpc64le"
+        || (target.arch == "x86" && !target.has_feature("sse"))
+        || target.os == "windows"
+        // Linking says "error: function signature mismatch: __extendhfsf2" and seems to
+        // think the signature is either `(i32) -> f32` or `(f32) -> f32`. See
+        // .
+        || target.arch == "wasm32"
+        || target.arch == "wasm64"
+    {
+        features.insert(Feature::NoSysF16);
+        features.insert(Feature::NoSysF16F128Convert);
+    }
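+
+    // The loop below emits one cargo directive pair per detected limitation; e.g. a
+    // target that lands in both sets above would get, roughly:
+    //   cargo:rustc-cfg=feature="no-sys-f128"
+    //   cargo:rustc-cfg=feature="no-sys-f16"
+    // plus a `cargo:warning=...` line each, which the tests and benchmarks then consume
+    // through gates such as `sys_available: not(feature = "no-sys-f128")`.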
+
+    for feature in features {
+        let (name, warning) = match feature {
+            Feature::NoSysF128 => ("no-sys-f128", "using apfloat fallback for f128"),
+            Feature::NoSysF128IntConvert => (
+                "no-sys-f128-int-convert",
+                "using apfloat fallback for f128 <-> int conversions",
+            ),
+            Feature::NoSysF16F128Convert => (
+                "no-sys-f16-f128-convert",
+                "using apfloat fallback for f16 <-> f128 conversions",
+            ),
+            Feature::NoSysF16 => ("no-sys-f16", "using apfloat fallback for f16"),
+        };
+        println!("cargo:warning={warning}");
+        println!("cargo:rustc-cfg=feature=\"{name}\"");
+    }
+
+    builtins_configure::configure_f16_f128(&target);
+}
diff --git a/testcrate/src/bench.rs b/testcrate/src/bench.rs
new file mode 100644
index 00000000..f5da1f3a
--- /dev/null
+++ b/testcrate/src/bench.rs
@@ -0,0 +1,361 @@
+use core::cell::RefCell;
+
+use alloc::vec::Vec;
+use compiler_builtins::float::Float;
+
+/// Fuzz with this many items to check that functions are equal
+pub const CHECK_ITER_ITEMS: u32 = 10_000;
+/// Benchmark with this many items to get a variety
+pub const BENCH_ITER_ITEMS: u32 = 500;
+
+/// Still run benchmarks/tests but don't check correctness between compiler-builtins and
+/// the builtin system functions
+pub fn skip_sys_checks(test_name: &str) -> bool {
+    const ALWAYS_SKIPPED: &[&str] = &[
+        // FIXME(f16_f128): system symbols have incorrect results
+        //
+        "extend_f16_f32",
+        "trunc_f32_f16",
+        "trunc_f64_f16",
+        // FIXME(#616): re-enable once fix is in nightly
+        //
+        "mul_f32",
+        "mul_f64",
+    ];
+
+    // FIXME(f16_f128): error on LE ppc64. There are more tests that are cfg-ed out completely
+    // in their benchmark modules due to runtime panics.
+    //
+    const PPC64LE_SKIPPED: &[&str] = &["extend_f32_f128"];
+
+    // FIXME(f16_f128): system symbols have incorrect results
+    //
+    const X86_NO_SSE_SKIPPED: &[&str] = &[
+        "add_f128", "sub_f128", "mul_f128", "div_f128", "powi_f32", "powi_f64",
+    ];
+
+    // FIXME(f16_f128): Wide multiply carry bug in `compiler-rt`, re-enable when nightly no longer
+    // uses `compiler-rt` version.
+    //
+    const AARCH64_SKIPPED: &[&str] = &["mul_f128", "div_f128"];
+
+    // FIXME(llvm): system symbols have incorrect results on Windows
+    //
+    const WINDOWS_SKIPPED: &[&str] = &[
+        "conv_f32_u128",
+        "conv_f32_i128",
+        "conv_f64_u128",
+        "conv_f64_i128",
+    ];
+
+    if cfg!(target_arch = "arm") {
+        // The Arm symbols need a different ABI that our macro doesn't handle, just skip it
+        return true;
+    }
+
+    if ALWAYS_SKIPPED.contains(&test_name) {
+        return true;
+    }
+
+    if cfg!(all(target_arch = "powerpc64", target_endian = "little"))
+        && PPC64LE_SKIPPED.contains(&test_name)
+    {
+        return true;
+    }
+
+    if cfg!(all(target_arch = "x86", not(target_feature = "sse")))
+        && X86_NO_SSE_SKIPPED.contains(&test_name)
+    {
+        return true;
+    }
+
+    if cfg!(target_arch = "aarch64") && AARCH64_SKIPPED.contains(&test_name) {
+        return true;
+    }
+
+    if cfg!(target_family = "windows") && WINDOWS_SKIPPED.contains(&test_name) {
+        return true;
+    }
+
+    false
+}
+
+/// Still run benchmarks/tests but don't check correctness between compiler-builtins and
+/// assembly functions
+pub fn skip_asm_checks(_test_name: &str) -> bool {
+    // Nothing to skip at this time
+    false
+}
+
+/// Create a comparison of the system symbol, compiler_builtins, and optionally handwritten
+/// assembly.
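+///
+/// Usage sketch (a hypothetical invocation for the `__addsf3` builtin; the field names
+/// match the macro arms below):
+///
+/// ```ignore
+/// float_bench! {
+///     name: add_f32,
+///     sig: (a: f32, b: f32) -> f32,
+///     crate_fn: add::__addsf3,
+///     sys_fn: __addsf3,
+///     sys_available: all(),
+///     asm: [],
+/// }
+/// ```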
+#[macro_export]
+macro_rules! float_bench {
+    (
+        // Name of this benchmark
+        name: $name:ident,
+        // The function signature to be tested
+        sig: ($($arg:ident: $arg_ty:ty),*) -> $ret_ty:ty,
+        // Path to the function in the `compiler_builtins` crate
+        crate_fn: $crate_fn:path,
+        // Optional alias on ppc
+        $( crate_fn_ppc: $crate_fn_ppc:path, )?
+        // Name of the system symbol
+        sys_fn: $sys_fn:ident,
+        // Optional alias on ppc
+        $( sys_fn_ppc: $sys_fn_ppc:path, )?
+        // Meta saying whether the system symbol is available
+        sys_available: $sys_available:meta,
+        // An optional function to validate that the results of two functions are equal;
+        // defaults to `$ret_ty::check_eq` if not given
+        $( output_eq: $output_eq:expr, )?
+        // Assembly implementations, if any.
+        asm: [
+            $(
+                #[cfg($asm_meta:meta)] {
+                    $($asm_tt:tt)*
+                }
+            );*
+            $(;)?
+        ]
+        $(,)?
+    ) => {paste::paste! {
+        #[cfg($sys_available)]
+        extern "C" {
+            /// Binding for the system function
+            #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+            fn $sys_fn($($arg: $arg_ty),*) -> $ret_ty;
+
+            #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+            float_bench! { @coalesce_fn $($sys_fn_ppc)? =>
+                fn $sys_fn($($arg: $arg_ty),*) -> $ret_ty;
+            }
+        }
+
+        fn $name(c: &mut Criterion) {
+            use core::hint::black_box;
+            use compiler_builtins::float::Float;
+            use $crate::bench::TestIO;
+
+            #[inline(never)] // equalize with external calls
+            fn crate_fn($($arg: $arg_ty),*) -> $ret_ty {
+                #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+                let target_crate_fn = $crate_fn;
+
+                // On PPC, use an alias if specified
+                #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+                let target_crate_fn = float_bench!(@coalesce $($crate_fn_ppc)?, $crate_fn);
+
+                target_crate_fn( $($arg),* )
+            }
+
+            #[inline(always)] // already a branch
+            #[cfg($sys_available)]
+            fn sys_fn($($arg: $arg_ty),*) -> $ret_ty {
+                #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+                let target_sys_fn = $sys_fn;
+
+                // On PPC, use an alias if specified
+                #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+                let target_sys_fn = float_bench!(@coalesce $($sys_fn_ppc)?, $sys_fn);
+
+                unsafe { target_sys_fn( $($arg),* ) }
+            }
+
+            #[inline(never)] // equalize with external calls
+            #[cfg(any( $($asm_meta),* ))]
+            fn asm_fn($(mut $arg: $arg_ty),*) -> $ret_ty {
+                use core::arch::asm;
+                $(
+                    #[cfg($asm_meta)]
+                    unsafe { $($asm_tt)* }
+                )*
+            }
+
+            let testvec = <($($arg_ty),*)>::make_testvec($crate::bench::CHECK_ITER_ITEMS);
+            let benchvec = <($($arg_ty),*)>::make_testvec($crate::bench::BENCH_ITER_ITEMS);
+            let test_name = stringify!($name);
+            let check_eq = float_bench!(@coalesce $($output_eq)?, $ret_ty::check_eq);
+
+            // Verify math lines up. We run the crate functions even if we don't validate the
+            // output here to make sure there are no panics or crashes.
+
+            #[cfg($sys_available)]
+            for ($($arg),*) in testvec.iter().copied() {
+                let crate_res = crate_fn($($arg),*);
+                let sys_res = sys_fn($($arg),*);
+
+                if $crate::bench::skip_sys_checks(test_name) {
+                    continue;
+                }
+
+                assert!(
+                    check_eq(crate_res, sys_res),
+                    "{test_name}{:?}: crate: {crate_res:?}, sys: {sys_res:?}",
+                    ($($arg),* ,)
+                );
+            }
+
+            #[cfg(any( $($asm_meta),* ))]
+            {
+                for ($($arg),*) in testvec.iter().copied() {
+                    let crate_res = crate_fn($($arg),*);
+                    let asm_res = asm_fn($($arg),*);
+
+                    if $crate::bench::skip_asm_checks(test_name) {
+                        continue;
+                    }
+
+                    assert!(
+                        check_eq(crate_res, asm_res),
+                        "{test_name}{:?}: crate: {crate_res:?}, asm: {asm_res:?}",
+                        ($($arg),* ,)
+                    );
+                }
+            }
+
+            let mut group = c.benchmark_group(test_name);
+            group.bench_function("compiler-builtins", |b| b.iter(|| {
+                for ($($arg),*) in benchvec.iter().copied() {
+                    black_box(crate_fn( $(black_box($arg)),* ));
+                }
+            }));
+
+            #[cfg($sys_available)]
+            group.bench_function("system", |b| b.iter(|| {
+                for ($($arg),*) in benchvec.iter().copied() {
+                    black_box(sys_fn( $(black_box($arg)),* ));
+                }
+            }));
+
+            #[cfg(any( $($asm_meta),* ))]
+            group.bench_function(&format!(
+                "assembly ({} {})", std::env::consts::ARCH, std::env::consts::FAMILY
+            ), |b| b.iter(|| {
+                for ($($arg),*) in benchvec.iter().copied() {
+                    black_box(asm_fn( $(black_box($arg)),* ));
+                }
+            }));
+
+            group.finish();
+        }
+    }};
+
+    // Allow overriding a default
+    (@coalesce $specified:expr, $default:expr) => { $specified };
+    (@coalesce, $default:expr) => { $default };
+
+    // Allow overriding a function name
+    (@coalesce_fn $specified:ident => fn $default_name:ident $($tt:tt)+) => {
+        fn $specified $($tt)+
+    };
+    (@coalesce_fn => fn $default_name:ident $($tt:tt)+) => {
+        fn $default_name $($tt)+
+    };
+}
+
+/// A type used as either an input or output to/from a benchmark function.
+pub trait TestIO: Sized {
+    fn make_testvec(len: u32) -> Vec<Self>;
+    fn check_eq(a: Self, b: Self) -> bool;
+}
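+
+// Example of how these hooks are used (a sketch; the impls are generated below):
+// `<(f32, f32)>::make_testvec(len)` yields fuzzed input pairs for binary operations,
+// while `f32::check_eq` defers to `Float::eq_repr`, comparing bit representations
+// rather than using `==` so that NaN-producing cases can still be validated.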
+macro_rules! impl_testio {
+    (float $($f_ty:ty),+) => {$(
+        impl TestIO for $f_ty {
+            fn make_testvec(len: u32) -> Vec<Self> {
+                // refcell because fuzz_* takes a `Fn`
+                let ret = RefCell::new(Vec::new());
+                crate::fuzz_float(len, |a| ret.borrow_mut().push(a));
+                ret.into_inner()
+            }
+
+            fn check_eq(a: Self, b: Self) -> bool {
+                Float::eq_repr(a, b)
+            }
+        }
+
+        impl TestIO for ($f_ty, $f_ty) {
+            fn make_testvec(len: u32) -> Vec<Self> {
+                // refcell because fuzz_* takes a `Fn`
+                let ret = RefCell::new(Vec::new());
+                crate::fuzz_float_2(len, |a, b| ret.borrow_mut().push((a, b)));
+                ret.into_inner()
+            }
+
+            fn check_eq(_a: Self, _b: Self) -> bool {
+                unimplemented!()
+            }
+        }
+    )*};
+
+    (int $($i_ty:ty),+) => {$(
+        impl TestIO for $i_ty {
+            fn make_testvec(len: u32) -> Vec<Self> {
+                // refcell because fuzz_* takes a `Fn`
+                let ret = RefCell::new(Vec::new());
+                crate::fuzz(len, |a| ret.borrow_mut().push(a));
+                ret.into_inner()
+            }
+
+            fn check_eq(a: Self, b: Self) -> bool {
+                a == b
+            }
+        }
+
+        impl TestIO for ($i_ty, $i_ty) {
+            fn make_testvec(len: u32) -> Vec<Self> {
+                // refcell because fuzz_* takes a `Fn`
+                let ret = RefCell::new(Vec::new());
+                crate::fuzz_2(len, |a, b| ret.borrow_mut().push((a, b)));
+                ret.into_inner()
+            }
+
+            fn check_eq(_a: Self, _b: Self) -> bool {
+                unimplemented!()
+            }
+        }
+    )*};
+
+    ((float, int) ($f_ty:ty, $i_ty:ty)) => {
+        impl TestIO for ($f_ty, $i_ty) {
+            fn make_testvec(len: u32) -> Vec<Self> {
+                // refcell because fuzz_* takes a `Fn`
+                let ivec = RefCell::new(Vec::new());
+                let fvec = RefCell::new(Vec::new());
+
+                crate::fuzz(len.isqrt(), |a| ivec.borrow_mut().push(a));
+                crate::fuzz_float(len.isqrt(), |a| fvec.borrow_mut().push(a));
+
+                let mut ret = Vec::new();
+                let ivec = ivec.into_inner();
+                let fvec = fvec.into_inner();
+
+                for f in fvec {
+                    for i in &ivec {
+                        ret.push((f, *i));
+                    }
+                }
+
+                ret
+            }
+
+            fn check_eq(_a: Self, _b: Self) -> bool {
+                unimplemented!()
+            }
+        }
+    }
+}
+
+#[cfg(f16_enabled)]
+impl_testio!(float f16);
+impl_testio!(float f32, f64);
+#[cfg(f128_enabled)]
+impl_testio!(float f128);
+impl_testio!(int i16, i32, i64, i128);
+impl_testio!(int u16, u32, u64, u128);
+impl_testio!((float, int)(f32, i32));
+impl_testio!((float, int)(f64, i32));
+#[cfg(f128_enabled)]
+impl_testio!((float, int)(f128, i32));
diff --git a/testcrate/src/lib.rs b/testcrate/src/lib.rs
index 9bd155f6..cc9e7393 100644
--- a/testcrate/src/lib.rs
+++ b/testcrate/src/lib.rs
@@ -13,9 +13,15 @@
 //! Some floating point tests are disabled for specific architectures, because they do not have
 //! correct rounding.
 #![no_std]
+#![cfg_attr(f128_enabled, feature(f128))]
+#![cfg_attr(f16_enabled, feature(f16))]
+#![feature(isqrt)]
+
+pub mod bench;
+extern crate alloc;
 
 use compiler_builtins::float::Float;
-use compiler_builtins::int::Int;
+use compiler_builtins::int::{Int, MinInt};
 use rand_xoshiro::rand_core::{RngCore, SeedableRng};
 use rand_xoshiro::Xoshiro128StarStar;
 
@@ -101,7 +107,10 @@ macro_rules! edge_cases {
 
 /// Feeds a series of fuzzing inputs to `f`. The fuzzer first uses an algorithm designed to find
 /// edge cases, followed by a more random fuzzer that runs `n` times.
-pub fn fuzz<I: Int, F: FnMut(I)>(n: u32, mut f: F) {
+pub fn fuzz<I: Int, F: FnMut(I)>(n: u32, mut f: F)
+where
+    <I as MinInt>::UnsignedInt: Int,
+{
     // edge case tester. Calls `f` 210 times for u128.
     // zero gets skipped by the loop
     f(I::ZERO);
@@ -111,7 +120,7 @@ pub fn fuzz<I: Int, F: FnMut(I)>(n: u32, mut f: F) {
     // random fuzzer
     let mut rng = Xoshiro128StarStar::seed_from_u64(0);
-    let mut x: I = Int::ZERO;
+    let mut x: I = MinInt::ZERO;
     for _ in 0..n {
         fuzz_step(&mut rng, &mut x);
         f(x)
@@ -119,7 +128,7 @@
 }
 
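+// Usage sketch (mirroring the integer tests in this PR): `fuzz(N, |x: u128| ...)` first
+// feeds deterministic edge-case patterns, then `n` pseudo-random values from a fixed
+// seed (`Xoshiro128StarStar::seed_from_u64(0)`), so any failure reproduces across runs.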
 /// The same as `fuzz`, except `f` has two inputs.
-pub fn fuzz_2<I: Int, F: Fn(I, I)>(n: u32, f: F) {
+pub fn fuzz_2<I: Int, F: Fn(I, I)>(n: u32, f: F)
+where
+    <I as MinInt>::UnsignedInt: Int,
+{
     // Check cases where the first and second inputs are zero. Both call `f` 210 times for `u128`.
     edge_cases!(I, case, {
         f(I::ZERO, case);
@@ -150,10 +162,10 @@ pub fn fuzz_shift<I: Int, F: Fn(I, u32)>(f: F) {
     // Shift functions are very simple and do not need anything other than shifting a small
     // set of random patterns for every fuzz length.
     let mut rng = Xoshiro128StarStar::seed_from_u64(0);
-    let mut x: I = Int::ZERO;
+    let mut x: I = MinInt::ZERO;
     for i in 0..I::FUZZ_NUM {
         fuzz_step(&mut rng, &mut x);
-        f(x, Int::ZERO);
+        f(x, MinInt::ZERO);
         f(x, I::FUZZ_LENGTHS[i] as u32);
     }
 }
@@ -257,3 +269,71 @@ pub fn fuzz_float_2<F: Float, E: Fn(F, F)>(n: u32, f: E) {
         f(x, y)
     }
 }
+
+/// Perform an operation using builtin types if available, falling back to apfloat if not.
+#[macro_export]
+macro_rules! apfloat_fallback {
+    (
+        $float_ty:ty,
+        // Type name in `rustc_apfloat::ieee`. Not a full path, it automatically gets the prefix.
+        $apfloat_ty:ident,
+        // Cfg expression for when builtin system operations should be used
+        $sys_available:meta,
+        // The expression to run. This expression may use `FloatTy` for its signature.
+        // Optionally, the final conversion back to a float can be suppressed using
+        // `=> no_convert` (for e.g. operations that return a bool).
+        //
+        // If the apfloat needs a different operation, it can be provided here.
+        $op:expr $(=> $convert:ident)? $(; $apfloat_op:expr)?,
+        // Arguments that get passed to `$op` after converting to a float
+        $($arg:expr),+
+        $(,)?
+    ) => {{
+        #[cfg($sys_available)]
+        let ret = {
+            type FloatTy = $float_ty;
+            $op( $($arg),+ )
+        };
+
+        #[cfg(not($sys_available))]
+        let ret = {
+            use rustc_apfloat::Float;
+            type FloatTy = rustc_apfloat::ieee::$apfloat_ty;
+
+            apfloat_fallback!(@inner
+                fty: $float_ty,
+                // Apply a conversion to `FloatTy` to each arg, then pass all args to `$op`
+                op_res: $op( $(FloatTy::from_bits($arg.to_bits().into())),+ ),
+                $(apfloat_op: $apfloat_op, )?
+                $(conv_opts: $convert,)?
+                args: $($arg),+
+            )
+        };
+
+        ret
+    }};
+
+    // Operations that do not need converting back to a float
+    (@inner fty: $float_ty:ty, op_res: $val:expr, conv_opts: no_convert, args: $($_arg:expr),+) => {
+        $val
+    };
+
+    // Some apfloat operations return a `StatusAnd` that we need to extract the value from. This
+    // is the default.
+    (@inner fty: $float_ty:ty, op_res: $val:expr, args: $($_arg:expr),+) => {{
+        // ignore the status, just get the value
+        let unwrapped = $val.value;
+
+        <$float_ty>::from_bits(FloatTy::to_bits(unwrapped).try_into().unwrap())
+    }};
+
+    // This is the case where we can't use the same expression for the default builtin and
+    // nonstandard apfloat fallback (e.g. `as` casts in std are normal functions in apfloat, so
+    // two separate expressions must be specified).
+ (@inner + fty: $float_ty:ty, op_res: $_val:expr, + apfloat_op: $apfloat_op:expr, args: $($arg:expr),+ + ) => {{ + $apfloat_op($($arg),+) + }}; +} diff --git a/testcrate/tests/addsub.rs b/testcrate/tests/addsub.rs index da7684ec..284a2bf5 100644 --- a/testcrate/tests/addsub.rs +++ b/testcrate/tests/addsub.rs @@ -1,126 +1,141 @@ #![allow(unused_macros)] +#![cfg_attr(f128_enabled, feature(f128))] use testcrate::*; -macro_rules! sum { - ($($i:ty, $fn_add:ident, $fn_sub:ident);*;) => { - $( - fuzz_2(N, |x: $i, y: $i| { - let add0 = x.wrapping_add(y); - let sub0 = x.wrapping_sub(y); - let add1: $i = $fn_add(x, y); - let sub1: $i = $fn_sub(x, y); - if add0 != add1 { - panic!( - "{}({}, {}): std: {}, builtins: {}", - stringify!($fn_add), x, y, add0, add1 - ); - } - if sub0 != sub1 { - panic!( - "{}({}, {}): std: {}, builtins: {}", - stringify!($fn_sub), x, y, sub0, sub1 - ); - } - }); - )* - }; -} +mod int_addsub { + use super::*; -macro_rules! overflowing_sum { - ($($i:ty, $fn_add:ident, $fn_sub:ident);*;) => { - $( - fuzz_2(N, |x: $i, y: $i| { - let add0 = x.overflowing_add(y); - let sub0 = x.overflowing_sub(y); - let add1: ($i, bool) = $fn_add(x, y); - let sub1: ($i, bool) = $fn_sub(x, y); - if add0.0 != add1.0 || add0.1 != add1.1 { - panic!( - "{}({}, {}): std: {:?}, builtins: {:?}", - stringify!($fn_add), x, y, add0, add1 - ); - } - if sub0.0 != sub1.0 || sub0.1 != sub1.1 { - panic!( - "{}({}, {}): std: {:?}, builtins: {:?}", - stringify!($fn_sub), x, y, sub0, sub1 - ); + macro_rules! sum { + ($($i:ty, $fn_add:ident, $fn_sub:ident);*;) => { + $( + #[test] + fn $fn_add() { + use compiler_builtins::int::addsub::{$fn_add, $fn_sub}; + + fuzz_2(N, |x: $i, y: $i| { + let add0 = x.wrapping_add(y); + let sub0 = x.wrapping_sub(y); + let add1: $i = $fn_add(x, y); + let sub1: $i = $fn_sub(x, y); + if add0 != add1 { + panic!( + "{}({}, {}): std: {}, builtins: {}", + stringify!($fn_add), x, y, add0, add1 + ); + } + if sub0 != sub1 { + panic!( + "{}({}, {}): std: {}, builtins: {}", + stringify!($fn_sub), x, y, sub0, sub1 + ); + } + }); } - }); - )* - }; -} + )* + }; + } + + macro_rules! overflowing_sum { + ($($i:ty, $fn_add:ident, $fn_sub:ident);*;) => { + $( + #[test] + fn $fn_add() { + use compiler_builtins::int::addsub::{$fn_add, $fn_sub}; -#[test] -fn addsub() { - use compiler_builtins::int::addsub::{ - __rust_i128_add, __rust_i128_addo, __rust_i128_sub, __rust_i128_subo, __rust_u128_add, - __rust_u128_addo, __rust_u128_sub, __rust_u128_subo, - }; + fuzz_2(N, |x: $i, y: $i| { + let add0 = x.overflowing_add(y); + let sub0 = x.overflowing_sub(y); + let add1: ($i, bool) = $fn_add(x, y); + let sub1: ($i, bool) = $fn_sub(x, y); + if add0.0 != add1.0 || add0.1 != add1.1 { + panic!( + "{}({}, {}): std: {:?}, builtins: {:?}", + stringify!($fn_add), x, y, add0, add1 + ); + } + if sub0.0 != sub1.0 || sub0.1 != sub1.1 { + panic!( + "{}({}, {}): std: {:?}, builtins: {:?}", + stringify!($fn_sub), x, y, sub0, sub1 + ); + } + }); + } + )* + }; + } // Integer addition and subtraction is very simple, so 100 fuzzing passes should be plenty. - sum!( + sum! { u128, __rust_u128_add, __rust_u128_sub; i128, __rust_i128_add, __rust_i128_sub; - ); - overflowing_sum!( + } + + overflowing_sum! { u128, __rust_u128_addo, __rust_u128_subo; i128, __rust_i128_addo, __rust_i128_subo; - ); + } } macro_rules! 
float_sum { - ($($f:ty, $fn_add:ident, $fn_sub:ident);*;) => { + ($($f:ty, $fn_add:ident, $fn_sub:ident, $apfloat_ty:ident, $sys_available:meta);*;) => { $( - fuzz_float_2(N, |x: $f, y: $f| { - let add0 = x + y; - let sub0 = x - y; - let add1: $f = $fn_add(x, y); - let sub1: $f = $fn_sub(x, y); - if !Float::eq_repr(add0, add1) { - panic!( - "{}({}, {}): std: {}, builtins: {}", - stringify!($fn_add), x, y, add0, add1 - ); - } - if !Float::eq_repr(sub0, sub1) { - panic!( - "{}({}, {}): std: {}, builtins: {}", - stringify!($fn_sub), x, y, sub0, sub1 - ); - } - }); + #[test] + fn $fn_add() { + use core::ops::{Add, Sub}; + use compiler_builtins::float::{{add::$fn_add, sub::$fn_sub}, Float}; + + fuzz_float_2(N, |x: $f, y: $f| { + let add0 = apfloat_fallback!($f, $apfloat_ty, $sys_available, Add::add, x, y); + let sub0 = apfloat_fallback!($f, $apfloat_ty, $sys_available, Sub::sub, x, y); + let add1: $f = $fn_add(x, y); + let sub1: $f = $fn_sub(x, y); + if !Float::eq_repr(add0, add1) { + panic!( + "{}({:?}, {:?}): std: {:?}, builtins: {:?}", + stringify!($fn_add), x, y, add0, add1 + ); + } + if !Float::eq_repr(sub0, sub1) { + panic!( + "{}({:?}, {:?}): std: {:?}, builtins: {:?}", + stringify!($fn_sub), x, y, sub0, sub1 + ); + } + }); + } )* - }; + } } #[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))] -#[test] -fn float_addsub() { - use compiler_builtins::float::{ - add::{__adddf3, __addsf3}, - sub::{__subdf3, __subsf3}, - Float, - }; - - float_sum!( - f32, __addsf3, __subsf3; - f64, __adddf3, __subdf3; - ); +mod float_addsub { + use super::*; + + float_sum! { + f32, __addsf3, __subsf3, Single, all(); + f64, __adddf3, __subdf3, Double, all(); + } } -#[cfg(target_arch = "arm")] -#[test] -fn float_addsub_arm() { - use compiler_builtins::float::{ - add::{__adddf3vfp, __addsf3vfp}, - sub::{__subdf3vfp, __subsf3vfp}, - Float, - }; - - float_sum!( - f32, __addsf3vfp, __subsf3vfp; - f64, __adddf3vfp, __subdf3vfp; - ); +#[cfg(f128_enabled)] +#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))] +#[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))] +mod float_addsub_f128 { + use super::*; + + float_sum! { + f128, __addtf3, __subtf3, Quad, not(feature = "no-sys-f128"); + } +} + +#[cfg(f128_enabled)] +#[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))] +mod float_addsub_f128_ppc { + use super::*; + + float_sum! 
{ + f128, __addkf3, __subkf3, Quad, not(feature = "no-sys-f128"); + } } diff --git a/testcrate/tests/aeabi_memclr.rs b/testcrate/tests/aeabi_memclr.rs index 59507693..bfd15a39 100644 --- a/testcrate/tests/aeabi_memclr.rs +++ b/testcrate/tests/aeabi_memclr.rs @@ -5,7 +5,6 @@ feature = "mem" ))] #![feature(compiler_builtins_lib)] -#![feature(lang_items)] #![no_std] extern crate compiler_builtins; diff --git a/testcrate/tests/aeabi_memcpy.rs b/testcrate/tests/aeabi_memcpy.rs index 2d72dfbb..c892c5ab 100644 --- a/testcrate/tests/aeabi_memcpy.rs +++ b/testcrate/tests/aeabi_memcpy.rs @@ -5,7 +5,6 @@ feature = "mem" ))] #![feature(compiler_builtins_lib)] -#![feature(lang_items)] #![no_std] extern crate compiler_builtins; diff --git a/testcrate/tests/aeabi_memset.rs b/testcrate/tests/aeabi_memset.rs index f03729be..34ab3acc 100644 --- a/testcrate/tests/aeabi_memset.rs +++ b/testcrate/tests/aeabi_memset.rs @@ -5,7 +5,6 @@ feature = "mem" ))] #![feature(compiler_builtins_lib)] -#![feature(lang_items)] #![no_std] extern crate compiler_builtins; diff --git a/testcrate/tests/big.rs b/testcrate/tests/big.rs new file mode 100644 index 00000000..595f6225 --- /dev/null +++ b/testcrate/tests/big.rs @@ -0,0 +1,134 @@ +use compiler_builtins::int::{i256, u256, HInt, MinInt}; + +const LOHI_SPLIT: u128 = 0xaaaaaaaaaaaaaaaaffffffffffffffff; + +/// Print a `u256` as hex since we can't add format implementations +fn hexu(v: u256) -> String { + format!( + "0x{:016x}{:016x}{:016x}{:016x}", + v.0[3], v.0[2], v.0[1], v.0[0] + ) +} + +#[test] +fn widen_u128() { + assert_eq!(u128::MAX.widen(), u256([u64::MAX, u64::MAX, 0, 0])); + assert_eq!( + LOHI_SPLIT.widen(), + u256([u64::MAX, 0xaaaaaaaaaaaaaaaa, 0, 0]) + ); +} + +#[test] +fn widen_i128() { + assert_eq!((-1i128).widen(), u256::MAX.signed()); + assert_eq!( + (LOHI_SPLIT as i128).widen(), + i256([u64::MAX, 0xaaaaaaaaaaaaaaaa, u64::MAX, u64::MAX]) + ); + assert_eq!((-1i128).zero_widen().unsigned(), (u128::MAX).widen()); +} + +#[test] +fn widen_mul_u128() { + let tests = [ + (u128::MAX / 2, 2_u128, u256([u64::MAX - 1, u64::MAX, 0, 0])), + (u128::MAX, 2_u128, u256([u64::MAX - 1, u64::MAX, 1, 0])), + (u128::MAX, u128::MAX, u256([1, 0, u64::MAX - 1, u64::MAX])), + (u128::MIN, u128::MIN, u256::ZERO), + (1234, 0, u256::ZERO), + (0, 1234, u256::ZERO), + ]; + + let mut errors = Vec::new(); + for (i, (a, b, exp)) in tests.iter().copied().enumerate() { + let res = a.widen_mul(b); + let res_z = a.zero_widen_mul(b); + assert_eq!(res, res_z); + if res != exp { + errors.push((i, a, b, exp, res)); + } + } + + for (i, a, b, exp, res) in &errors { + eprintln!( + "FAILURE ({i}): {a:#034x} * {b:#034x} = {} got {}", + hexu(*exp), + hexu(*res) + ); + } + assert!(errors.is_empty()); +} + +#[test] +fn not_u128() { + assert_eq!(!u256::ZERO, u256::MAX); +} + +#[test] +fn shr_u128() { + let only_low = [ + 1, + u16::MAX.into(), + u32::MAX.into(), + u64::MAX.into(), + u128::MAX, + ]; + + let mut errors = Vec::new(); + + for a in only_low { + for perturb in 0..10 { + let a = a.saturating_add(perturb); + for shift in 0..128 { + let res = a.widen() >> shift; + let expected = (a >> shift).widen(); + if res != expected { + errors.push((a.widen(), shift, res, expected)); + } + } + } + } + + let check = [ + ( + u256::MAX, + 1, + u256([u64::MAX, u64::MAX, u64::MAX, u64::MAX >> 1]), + ), + ( + u256::MAX, + 5, + u256([u64::MAX, u64::MAX, u64::MAX, u64::MAX >> 5]), + ), + (u256::MAX, 63, u256([u64::MAX, u64::MAX, u64::MAX, 1])), + (u256::MAX, 64, u256([u64::MAX, u64::MAX, u64::MAX, 0])), + (u256::MAX, 65, 
u256([u64::MAX, u64::MAX, u64::MAX >> 1, 0])), + (u256::MAX, 127, u256([u64::MAX, u64::MAX, 1, 0])), + (u256::MAX, 128, u256([u64::MAX, u64::MAX, 0, 0])), + (u256::MAX, 129, u256([u64::MAX, u64::MAX >> 1, 0, 0])), + (u256::MAX, 191, u256([u64::MAX, 1, 0, 0])), + (u256::MAX, 192, u256([u64::MAX, 0, 0, 0])), + (u256::MAX, 193, u256([u64::MAX >> 1, 0, 0, 0])), + (u256::MAX, 191, u256([u64::MAX, 1, 0, 0])), + (u256::MAX, 254, u256([0b11, 0, 0, 0])), + (u256::MAX, 255, u256([1, 0, 0, 0])), + ]; + + for (input, shift, expected) in check { + let res = input >> shift; + if res != expected { + errors.push((input, shift, res, expected)); + } + } + + for (a, b, res, expected) in &errors { + eprintln!( + "FAILURE: {} >> {b} = {} got {}", + hexu(*a), + hexu(*expected), + hexu(*res), + ); + } + assert!(errors.is_empty()); +} diff --git a/testcrate/tests/cmp.rs b/testcrate/tests/cmp.rs index 5c10a560..e3161f37 100644 --- a/testcrate/tests/cmp.rs +++ b/testcrate/tests/cmp.rs @@ -1,114 +1,188 @@ #![allow(unused_macros)] +#![allow(unreachable_code)] +#![cfg_attr(f128_enabled, feature(f128))] +#[cfg(not(target_arch = "powerpc64"))] use testcrate::*; -macro_rules! cmp { - ($x:ident, $y:ident, $($unordered_val:expr, $fn:ident);*;) => { - $( - let cmp0 = if $x.is_nan() || $y.is_nan() { - $unordered_val - } else if $x < $y { - -1 - } else if $x == $y { - 0 - } else { - 1 - }; - let cmp1 = $fn($x, $y); - if cmp0 != cmp1 { - panic!("{}({}, {}): std: {}, builtins: {}", stringify!($fn_builtins), $x, $y, cmp0, cmp1); - } - )* - }; -} - // PowerPC tests are failing on LLVM 13: https://github.com/rust-lang/rust/issues/88520 #[cfg(not(target_arch = "powerpc64"))] -#[test] -fn float_comparisons() { - use compiler_builtins::float::cmp::{ - __eqdf2, __eqsf2, __gedf2, __gesf2, __gtdf2, __gtsf2, __ledf2, __lesf2, __ltdf2, __ltsf2, - __nedf2, __nesf2, __unorddf2, __unordsf2, - }; - - fuzz_float_2(N, |x: f32, y: f32| { - assert_eq!(__unordsf2(x, y) != 0, x.is_nan() || y.is_nan()); - cmp!(x, y, - 1, __ltsf2; - 1, __lesf2; - 1, __eqsf2; - -1, __gesf2; - -1, __gtsf2; - 1, __nesf2; - ); - }); - fuzz_float_2(N, |x: f64, y: f64| { - assert_eq!(__unorddf2(x, y) != 0, x.is_nan() || y.is_nan()); - cmp!(x, y, - 1, __ltdf2; - 1, __ledf2; - 1, __eqdf2; - -1, __gedf2; - -1, __gtdf2; - 1, __nedf2; - ); - }); -} +mod float_comparisons { + use super::*; + + macro_rules! 
cmp { + ( + $f:ty, $x:ident, $y:ident, $apfloat_ty:ident, $sys_available:meta, + $($unordered_val:expr, $fn:ident);*; + ) => { + $( + let cmp0 = if apfloat_fallback!( + $f, $apfloat_ty, $sys_available, + |x: FloatTy| x.is_nan() => no_convert, + $x + ) || apfloat_fallback!( + $f, $apfloat_ty, $sys_available, + |y: FloatTy| y.is_nan() => no_convert, + $y + ) + { + $unordered_val + } else if apfloat_fallback!( + $f, $apfloat_ty, $sys_available, + |x, y| x < y => no_convert, + $x, $y + ) { + -1 + } else if apfloat_fallback!( + $f, $apfloat_ty, $sys_available, + |x, y| x == y => no_convert, + $x, $y + ) { + 0 + } else { + 1 + }; + + let cmp1 = $fn($x, $y); + if cmp0 != cmp1 { + panic!( + "{}({:?}, {:?}): std: {:?}, builtins: {:?}", + stringify!($fn), $x, $y, cmp0, cmp1 + ); + } + )* + }; + } + + #[test] + fn cmp_f32() { + use compiler_builtins::float::cmp::{ + __eqsf2, __gesf2, __gtsf2, __lesf2, __ltsf2, __nesf2, __unordsf2, + }; + + fuzz_float_2(N, |x: f32, y: f32| { + assert_eq!(__unordsf2(x, y) != 0, x.is_nan() || y.is_nan()); + cmp!(f32, x, y, Single, all(), + 1, __ltsf2; + 1, __lesf2; + 1, __eqsf2; + -1, __gesf2; + -1, __gtsf2; + 1, __nesf2; + ); + }); + } + + #[test] + fn cmp_f64() { + use compiler_builtins::float::cmp::{ + __eqdf2, __gedf2, __gtdf2, __ledf2, __ltdf2, __nedf2, __unorddf2, + }; + + fuzz_float_2(N, |x: f64, y: f64| { + assert_eq!(__unorddf2(x, y) != 0, x.is_nan() || y.is_nan()); + cmp!(f64, x, y, Double, all(), + 1, __ltdf2; + 1, __ledf2; + 1, __eqdf2; + -1, __gedf2; + -1, __gtdf2; + 1, __nedf2; + ); + }); + } -macro_rules! cmp2 { - ($x:ident, $y:ident, $($unordered_val:expr, $fn_std:expr, $fn_builtins:ident);*;) => { - $( - let cmp0: i32 = if $x.is_nan() || $y.is_nan() { - $unordered_val - } else { - $fn_std as i32 - }; - let cmp1: i32 = $fn_builtins($x, $y); - if cmp0 != cmp1 { - panic!("{}({}, {}): std: {}, builtins: {}", stringify!($fn_builtins), $x, $y, cmp0, cmp1); - } - )* - }; + #[test] + #[cfg(f128_enabled)] + fn cmp_f128() { + #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))] + use compiler_builtins::float::cmp::{ + __eqkf2 as __eqtf2, __gekf2 as __getf2, __gtkf2 as __gttf2, __lekf2 as __letf2, + __ltkf2 as __lttf2, __nekf2 as __netf2, __unordkf2 as __unordtf2, + }; + + #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))] + use compiler_builtins::float::cmp::{ + __eqtf2, __getf2, __gttf2, __letf2, __lttf2, __netf2, __unordtf2, + }; + + fuzz_float_2(N, |x: f128, y: f128| { + let x_is_nan = apfloat_fallback!( + f128, Quad, not(feature = "no-sys-f128"), + |x: FloatTy| x.is_nan() => no_convert, + x + ); + let y_is_nan = apfloat_fallback!( + f128, Quad, not(feature = "no-sys-f128"), + |x: FloatTy| x.is_nan() => no_convert, + y + ); + + assert_eq!(__unordtf2(x, y) != 0, x_is_nan || y_is_nan); + + cmp!(f128, x, y, Quad, not(feature = "no-sys-f128"), + 1, __lttf2; + 1, __letf2; + 1, __eqtf2; + -1, __getf2; + -1, __gttf2; + 1, __netf2; + ); + }); + } } #[cfg(target_arch = "arm")] -#[test] -fn float_comparisons_arm() { - use compiler_builtins::float::cmp::{ - __aeabi_dcmpeq, __aeabi_dcmpge, __aeabi_dcmpgt, __aeabi_dcmple, __aeabi_dcmplt, - __aeabi_fcmpeq, __aeabi_fcmpge, __aeabi_fcmpgt, __aeabi_fcmple, __aeabi_fcmplt, __eqdf2vfp, - __eqsf2vfp, __gedf2vfp, __gesf2vfp, __gtdf2vfp, __gtsf2vfp, __ledf2vfp, __lesf2vfp, - __ltdf2vfp, __ltsf2vfp, __nedf2vfp, __nesf2vfp, - }; - - fuzz_float_2(N, |x: f32, y: f32| { - cmp2!(x, y, - 0, x < y, __aeabi_fcmplt; - 0, x <= y, __aeabi_fcmple; - 0, x == y, __aeabi_fcmpeq; - 0, x >= y, __aeabi_fcmpge; - 0, x > y, 
__aeabi_fcmpgt; - 0, x < y, __ltsf2vfp; - 0, x <= y, __lesf2vfp; - 0, x == y, __eqsf2vfp; - 0, x >= y, __gesf2vfp; - 0, x > y, __gtsf2vfp; - 1, x != y, __nesf2vfp; - ); - }); - fuzz_float_2(N, |x: f64, y: f64| { - cmp2!(x, y, - 0, x < y, __aeabi_dcmplt; - 0, x <= y, __aeabi_dcmple; - 0, x == y, __aeabi_dcmpeq; - 0, x >= y, __aeabi_dcmpge; - 0, x > y, __aeabi_dcmpgt; - 0, x < y, __ltdf2vfp; - 0, x <= y, __ledf2vfp; - 0, x == y, __eqdf2vfp; - 0, x >= y, __gedf2vfp; - 0, x > y, __gtdf2vfp; - 1, x != y, __nedf2vfp; - ); - }); +mod float_comparisons_arm { + use super::*; + + macro_rules! cmp2 { + ($x:ident, $y:ident, $($unordered_val:expr, $fn_std:expr, $fn_builtins:ident);*;) => { + $( + let cmp0: i32 = if $x.is_nan() || $y.is_nan() { + $unordered_val + } else { + $fn_std as i32 + }; + let cmp1: i32 = $fn_builtins($x, $y); + if cmp0 != cmp1 { + panic!("{}({}, {}): std: {}, builtins: {}", stringify!($fn_builtins), $x, $y, cmp0, cmp1); + } + )* + }; + } + + #[test] + fn cmp_f32() { + use compiler_builtins::float::cmp::{ + __aeabi_fcmpeq, __aeabi_fcmpge, __aeabi_fcmpgt, __aeabi_fcmple, __aeabi_fcmplt, + }; + + fuzz_float_2(N, |x: f32, y: f32| { + cmp2!(x, y, + 0, x < y, __aeabi_fcmplt; + 0, x <= y, __aeabi_fcmple; + 0, x == y, __aeabi_fcmpeq; + 0, x >= y, __aeabi_fcmpge; + 0, x > y, __aeabi_fcmpgt; + ); + }); + } + + #[test] + fn cmp_f64() { + use compiler_builtins::float::cmp::{ + __aeabi_dcmpeq, __aeabi_dcmpge, __aeabi_dcmpgt, __aeabi_dcmple, __aeabi_dcmplt, + }; + + fuzz_float_2(N, |x: f64, y: f64| { + cmp2!(x, y, + 0, x < y, __aeabi_dcmplt; + 0, x <= y, __aeabi_dcmple; + 0, x == y, __aeabi_dcmpeq; + 0, x >= y, __aeabi_dcmpge; + 0, x > y, __aeabi_dcmpgt; + ); + }); + } } diff --git a/testcrate/tests/conv.rs b/testcrate/tests/conv.rs index 2a70db17..a52382cb 100644 --- a/testcrate/tests/conv.rs +++ b/testcrate/tests/conv.rs @@ -1,75 +1,85 @@ +#![cfg_attr(f128_enabled, feature(f128))] +#![cfg_attr(f16_enabled, feature(f16))] +// makes configuration easier +#![allow(unused_macros)] +#![allow(unused_imports)] + +use compiler_builtins::float::Float; +use rustc_apfloat::{Float as _, FloatConvert as _}; use testcrate::*; -macro_rules! i_to_f { - ($($from:ty, $into:ty, $fn:ident);*;) => { - $( - fuzz(N, |x: $from| { - let f0 = x as $into; - let f1: $into = $fn(x); - // This makes sure that the conversion produced the best rounding possible, and does - // this independent of `x as $into` rounding correctly. - // This assumes that float to integer conversion is correct. - let y_minus_ulp = <$into>::from_bits(f1.to_bits().wrapping_sub(1)) as $from; - let y = f1 as $from; - let y_plus_ulp = <$into>::from_bits(f1.to_bits().wrapping_add(1)) as $from; - let error_minus = <$from as Int>::abs_diff(y_minus_ulp, x); - let error = <$from as Int>::abs_diff(y, x); - let error_plus = <$from as Int>::abs_diff(y_plus_ulp, x); - // The first two conditions check that none of the two closest float values are - // strictly closer in representation to `x`. The second makes sure that rounding is - // towards even significand if two float values are equally close to the integer. 
- if error_minus < error - || error_plus < error - || ((error_minus == error || error_plus == error) - && ((f0.to_bits() & 1) != 0)) - { - if !cfg!(any( - target_arch = "powerpc", - target_arch = "powerpc64" - )) { - panic!( - "incorrect rounding by {}({}): {}, ({}, {}, {}), errors ({}, {}, {})", - stringify!($fn), - x, - f1.to_bits(), - y_minus_ulp, - y, - y_plus_ulp, - error_minus, - error, - error_plus, - ); - } - } - // Test against native conversion. We disable testing on all `x86` because of - // rounding bugs with `i686`. `powerpc` also has the same rounding bug. - if f0 != f1 && !cfg!(any( - target_arch = "x86", - target_arch = "powerpc", - target_arch = "powerpc64" - )) { - panic!( - "{}({}): std: {}, builtins: {}", - stringify!($fn), - x, - f0, - f1, - ); - } - }); - )* - }; -} +mod int_to_float { + use super::*; + + macro_rules! i_to_f { + ($($from:ty, $into:ty, $fn:ident);*;) => { + $( + #[test] + fn $fn() { + use compiler_builtins::float::conv::$fn; + use compiler_builtins::int::Int; -#[test] -fn int_to_float() { - use compiler_builtins::float::conv::{ - __floatdidf, __floatdisf, __floatsidf, __floatsisf, __floattidf, __floattisf, - __floatundidf, __floatundisf, __floatunsidf, __floatunsisf, __floatuntidf, __floatuntisf, - }; - use compiler_builtins::int::Int; + fuzz(N, |x: $from| { + let f0 = x as $into; + let f1: $into = $fn(x); + // This makes sure that the conversion produced the best rounding possible, and does + // this independent of `x as $into` rounding correctly. + // This assumes that float to integer conversion is correct. + let y_minus_ulp = <$into>::from_bits(f1.to_bits().wrapping_sub(1)) as $from; + let y = f1 as $from; + let y_plus_ulp = <$into>::from_bits(f1.to_bits().wrapping_add(1)) as $from; + let error_minus = <$from as Int>::abs_diff(y_minus_ulp, x); + let error = <$from as Int>::abs_diff(y, x); + let error_plus = <$from as Int>::abs_diff(y_plus_ulp, x); + // The first two conditions check that none of the two closest float values are + // strictly closer in representation to `x`. The second makes sure that rounding is + // towards even significand if two float values are equally close to the integer. + if error_minus < error + || error_plus < error + || ((error_minus == error || error_plus == error) + && ((f0.to_bits() & 1) != 0)) + { + if !cfg!(any( + target_arch = "powerpc", + target_arch = "powerpc64" + )) { + panic!( + "incorrect rounding by {}({}): {}, ({}, {}, {}), errors ({}, {}, {})", + stringify!($fn), + x, + f1.to_bits(), + y_minus_ulp, + y, + y_plus_ulp, + error_minus, + error, + error_plus, + ); + } + } + // Test against native conversion. We disable testing on all `x86` because of + // rounding bugs with `i686`. `powerpc` also has the same rounding bug. + if f0 != f1 && !cfg!(any( + target_arch = "x86", + target_arch = "powerpc", + target_arch = "powerpc64", + target_family = "solana" + )) { + panic!( + "{}({}): std: {}, builtins: {}", + stringify!($fn), + x, + f0, + f1, + ); + } + }); + } + )* + }; + } - i_to_f!( + i_to_f! { u32, f32, __floatunsisf; u32, f64, __floatunsidf; i32, f32, __floatsisf; @@ -82,51 +92,219 @@ fn int_to_float() { u128, f64, __floatuntidf; i128, f32, __floattisf; i128, f64, __floattidf; - ); + } } -macro_rules! f_to_i { - ($x:ident, $($f:ty, $fn:ident);*;) => { +// PowerPC tests are failing on LLVM 13: https://github.com/rust-lang/rust/issues/88520 +#[cfg(not(target_arch = "powerpc64"))] +mod f_to_i { + use super::*; + + macro_rules! 
f_to_i { + ($x:ident, $f_ty:ty, $apfloat_ty:ident, $sys_available:meta, $($i_ty:ty, $fn:ident);*;) => { + $( + // it is undefined behavior in the first place to do conversions with NaNs + if !apfloat_fallback!( + $f_ty, $apfloat_ty, $sys_available, |x: FloatTy| x.is_nan() => no_convert, $x + ) { + let conv0 = apfloat_fallback!( + $f_ty, $apfloat_ty, $sys_available, + // Use an `as` cast when the builtin is available on the system. + |x| x as $i_ty; + // When the builtin is not available, we need to use a different conversion + // method (since apfloat doesn't support `as` casting). + |x: $f_ty| { + use compiler_builtins::int::MinInt; + + let apf = FloatTy::from_bits(x.to_bits().into()); + let bits: usize = <$i_ty>::BITS.try_into().unwrap(); + + let err_fn = || panic!( + "Unable to convert value {x:?} to type {}:", stringify!($i_ty) + ); + + if <$i_ty>::SIGNED { + <$i_ty>::try_from(apf.to_i128(bits).value).ok().unwrap_or_else(err_fn) + } else { + <$i_ty>::try_from(apf.to_u128(bits).value).ok().unwrap_or_else(err_fn) + } + }, + $x + ); + let conv1: $i_ty = $fn($x); + if conv0 != conv1 { + panic!("{}({:?}): std: {:?}, builtins: {:?}", stringify!($fn), $x, conv0, conv1); + } + } + )* + }; + } + + #[test] + fn f32_to_int() { + use compiler_builtins::float::conv::{ + __fixsfdi, __fixsfsi, __fixsfti, __fixunssfdi, __fixunssfsi, __fixunssfti, + }; + + fuzz_float(N, |x: f32| { + f_to_i!(x, f32, Single, all(), + u32, __fixunssfsi; + u64, __fixunssfdi; + u128, __fixunssfti; + i32, __fixsfsi; + i64, __fixsfdi; + i128, __fixsfti; + ); + }); + } + + #[test] + fn f64_to_int() { + use compiler_builtins::float::conv::{ + __fixdfdi, __fixdfsi, __fixdfti, __fixunsdfdi, __fixunsdfsi, __fixunsdfti, + }; + + fuzz_float(N, |x: f64| { + f_to_i!(x, f64, Double, all(), + u32, __fixunsdfsi; + u64, __fixunsdfdi; + u128, __fixunsdfti; + i32, __fixdfsi; + i64, __fixdfdi; + i128, __fixdfti; + ); + }); + } + + #[test] + #[cfg(f128_enabled)] + fn f128_to_int() { + #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))] + use compiler_builtins::float::conv::{ + __fixkfdi as __fixtfdi, __fixkfsi as __fixtfsi, __fixkfti as __fixtfti, + __fixunskfdi as __fixunstfdi, __fixunskfsi as __fixunstfsi, + __fixunskfti as __fixunstfti, + }; + #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))] + use compiler_builtins::float::conv::{ + __fixtfdi, __fixtfsi, __fixtfti, __fixunstfdi, __fixunstfsi, __fixunstfti, + }; + + fuzz_float(N, |x: f128| { + f_to_i!( + x, + f128, + Quad, + not(feature = "no-sys-f128-int-convert"), + u32, __fixunstfsi; + u64, __fixunstfdi; + u128, __fixunstfti; + i32, __fixtfsi; + i64, __fixtfdi; + i128, __fixtfti; + ); + }); + } +} + +macro_rules! 
f_to_f { + ( + $mod:ident, $( - // it is undefined behavior in the first place to do conversions with NaNs - if !$x.is_nan() { - let conv0 = $x as $f; - let conv1: $f = $fn($x); - if conv0 != conv1 { - panic!("{}({}): std: {}, builtins: {}", stringify!($fn), $x, conv0, conv1); + $from_ty:ty => $to_ty:ty, + $from_ap_ty:ident => $to_ap_ty:ident, + $fn:ident, $sys_available:meta + );+; + ) => {$( + #[test] + fn $fn() { + use compiler_builtins::float::{$mod::$fn, Float}; + use rustc_apfloat::ieee::{$from_ap_ty, $to_ap_ty}; + + fuzz_float(N, |x: $from_ty| { + let tmp0: $to_ty = apfloat_fallback!( + $from_ty, + $from_ap_ty, + $sys_available, + |x: $from_ty| x as $to_ty; + |x: $from_ty| { + let from_apf = FloatTy::from_bits(x.to_bits().into()); + // Get `value` directly to ignore INVALID_OP + let to_apf: $to_ap_ty = from_apf.convert(&mut false).value; + <$to_ty>::from_bits(to_apf.to_bits().try_into().unwrap()) + }, + x + ); + let tmp1: $to_ty = $fn(x); + + if !Float::eq_repr(tmp0, tmp1) { + panic!( + "{}({:?}): std: {:?}, builtins: {:?}", + stringify!($fn), + x, + tmp0, + tmp1 + ); } - } - )* - }; + }) + } + )+}; } -// PowerPC tests are failing on LLVM 13: https://github.com/rust-lang/rust/issues/88520 -#[cfg(not(target_arch = "powerpc64"))] -#[test] -fn float_to_int() { - use compiler_builtins::float::conv::{ - __fixdfdi, __fixdfsi, __fixdfti, __fixsfdi, __fixsfsi, __fixsfti, __fixunsdfdi, - __fixunsdfsi, __fixunsdfti, __fixunssfdi, __fixunssfsi, __fixunssfti, - }; - - fuzz_float(N, |x: f32| { - f_to_i!(x, - u32, __fixunssfsi; - u64, __fixunssfdi; - u128, __fixunssfti; - i32, __fixsfsi; - i64, __fixsfdi; - i128, __fixsfti; - ); - }); - fuzz_float(N, |x: f64| { - f_to_i!(x, - u32, __fixunsdfsi; - u64, __fixunsdfdi; - u128, __fixunsdfti; - i32, __fixdfsi; - i64, __fixdfdi; - i128, __fixdfti; - ); - }); +mod extend { + use super::*; + + f_to_f! { + extend, + f32 => f64, Single => Double, __extendsfdf2, all(); + } + + #[cfg(all(f16_enabled, f128_enabled))] + #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))] + f_to_f! { + extend, + f16 => f32, Half => Single, __extendhfsf2, not(feature = "no-sys-f16"); + f16 => f32, Half => Single, __gnu_h2f_ieee, not(feature = "no-sys-f16"); + f16 => f128, Half => Quad, __extendhftf2, not(feature = "no-sys-f16-f128-convert"); + f32 => f128, Single => Quad, __extendsftf2, not(feature = "no-sys-f128"); + f64 => f128, Double => Quad, __extenddftf2, not(feature = "no-sys-f128"); + } + + #[cfg(f128_enabled)] + #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))] + f_to_f! { + extend, + // FIXME(#655): `f16` tests disabled until we can bootstrap symbols + f32 => f128, Single => Quad, __extendsfkf2, not(feature = "no-sys-f128"); + f64 => f128, Double => Quad, __extenddfkf2, not(feature = "no-sys-f128"); + } +} + +mod trunc { + use super::*; + + f_to_f! { + trunc, + f64 => f32, Double => Single, __truncdfsf2, all(); + } + + #[cfg(all(f16_enabled, f128_enabled))] + #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))] + f_to_f! { + trunc, + f32 => f16, Single => Half, __truncsfhf2, not(feature = "no-sys-f16"); + f32 => f16, Single => Half, __gnu_f2h_ieee, not(feature = "no-sys-f16"); + f128 => f16, Quad => Half, __trunctfhf2, not(feature = "no-sys-f16-f128-convert"); + f128 => f32, Quad => Single, __trunctfsf2, not(feature = "no-sys-f128"); + f128 => f64, Quad => Double, __trunctfdf2, not(feature = "no-sys-f128"); + } + + #[cfg(f128_enabled)] + #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))] + f_to_f! 
+    #[cfg(f128_enabled)]
+    #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+    f_to_f! {
+        trunc,
+        // FIXME(#655): `f16` tests disabled until we can bootstrap symbols
+        f128 => f32, Quad => Single, __trunckfsf2, not(feature = "no-sys-f128");
+        f128 => f64, Quad => Double, __trunckfdf2, not(feature = "no-sys-f128");
+    }
 }
diff --git a/testcrate/tests/div_rem.rs b/testcrate/tests/div_rem.rs
index de3bd9be..b7af47e3 100644
--- a/testcrate/tests/div_rem.rs
+++ b/testcrate/tests/div_rem.rs
@@ -1,7 +1,9 @@
+#![feature(f128)]
 #![allow(unused_macros)]
 
 use compiler_builtins::int::sdiv::{__divmoddi4, __divmodsi4, __divmodti4};
 use compiler_builtins::int::udiv::{__udivmoddi4, __udivmodsi4, __udivmodti4, u128_divide_sparc};
+
 use testcrate::*;
 
 // Division algorithms have by far the nastiest and largest number of edge cases, and experience shows
@@ -104,58 +106,64 @@ fn divide_sparc() {
 }
 
 macro_rules! float {
-    ($($i:ty, $fn:ident);*;) => {
+    ($($f:ty, $fn:ident, $apfloat_ty:ident, $sys_available:meta);*;) => {
         $(
-            fuzz_float_2(N, |x: $i, y: $i| {
-                let quo0 = x / y;
-                let quo1: $i = $fn(x, y);
-                #[cfg(not(target_arch = "arm"))]
-                if !Float::eq_repr(quo0, quo1) {
-                    panic!(
-                        "{}({}, {}): std: {}, builtins: {}",
-                        stringify!($fn), x, y, quo0, quo1
-                    );
-                }
+            #[test]
+            fn $fn() {
+                use compiler_builtins::float::{div::$fn, Float};
+                use core::ops::Div;
+
+                fuzz_float_2(N, |x: $f, y: $f| {
+                    let quo0: $f = apfloat_fallback!($f, $apfloat_ty, $sys_available, Div::div, x, y);
+                    let quo1: $f = $fn(x, y);
+
+                    // ARM SIMD instructions always flush subnormals to zero
+                    if cfg!(target_arch = "arm") &&
+                        ((Float::is_subnormal(quo0)) || Float::is_subnormal(quo1)) {
+                        return;
+                    }
 
-                // ARM SIMD instructions always flush subnormals to zero
-                #[cfg(target_arch = "arm")]
-                if !(Float::is_subnormal(quo0) || Float::is_subnormal(quo1)) {
                     if !Float::eq_repr(quo0, quo1) {
                         panic!(
-                            "{}({}, {}): std: {}, builtins: {}",
-                            stringify!($fn), x, y, quo0, quo1
+                            "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
+                            stringify!($fn),
+                            x,
+                            y,
+                            quo0,
+                            quo1
                         );
                     }
-                }
-            });
+                });
+            }
         )*
     };
 }
 
 #[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
-#[test]
-fn float_div() {
-    use compiler_builtins::float::{
-        div::{__divdf3, __divsf3},
-        Float,
-    };
+mod float_div {
+    use super::*;
 
-    float!(
-        f32, __divsf3;
-        f64, __divdf3;
-    );
-}
+    #[cfg(not(any(
+        all(target_arch = "x86", not(target_feature = "sse")),
+        target_family = "solana"
+    )))]
+    float! {
+        f32, __divsf3, Single, all();
+        f64, __divdf3, Double, all();
+    }
 
-#[cfg(target_arch = "arm")]
-#[test]
-fn float_div_arm() {
-    use compiler_builtins::float::{
-        div::{__divdf3vfp, __divsf3vfp},
-        Float,
-    };
+    #[cfg(not(feature = "no-f16-f128"))]
+    #[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+    float! {
+        f128, __divtf3, Quad,
+        // FIXME(llvm): there is a bug in LLVM rt.
+        // See .
+        not(any(feature = "no-sys-f128", all(target_arch = "aarch64", target_os = "linux")));
+    }
 
-    float!(
-        f32, __divsf3vfp;
-        f64, __divdf3vfp;
-    );
+    #[cfg(not(feature = "no-f16-f128"))]
+    #[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+    float! {
+        f128, __divkf3, Quad, not(feature = "no-sys-f128");
+    }
 }
diff --git a/testcrate/tests/float_pow.rs b/testcrate/tests/float_pow.rs
new file mode 100644
index 00000000..d85ee99d
--- /dev/null
+++ b/testcrate/tests/float_pow.rs
@@ -0,0 +1,72 @@
+#![allow(unused_macros)]
+#![cfg_attr(f128_enabled, feature(f128))]
+#![cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
+
+use testcrate::*;
+
+// This is approximate because of issues related to
+// https://github.com/rust-lang/rust/issues/73920.
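+// (`powi` is lowered to an LLVM intrinsic with no precision guarantee, so exact
+// bit-equality with the builtin cannot be expected; the `pow!` macro below
+// therefore compares with a per-type relative tolerance.)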
+// TODO how do we resolve this indeterminacy?
+macro_rules! pow {
+    ($($f:ty, $tolerance:expr, $fn:ident, $sys_available:meta);*;) => {
+        $(
+            #[test]
+            // FIXME(apfloat): We skip tests if system symbols aren't available rather
+            // than providing a fallback, since `rustc_apfloat` does not provide `pow`.
+            #[cfg($sys_available)]
+            fn $fn() {
+                use compiler_builtins::float::pow::$fn;
+                use compiler_builtins::float::Float;
+                fuzz_float_2(N, |x: $f, y: $f| {
+                    if !(Float::is_subnormal(x) || Float::is_subnormal(y) || x.is_nan()) {
+                        let n = y.to_bits() & !<$f as Float>::SIGNIFICAND_MASK;
+                        let n = (n as <$f as Float>::SignedInt) >> <$f as Float>::SIGNIFICAND_BITS;
+                        let n = n as i32;
+                        let tmp0: $f = x.powi(n);
+                        let tmp1: $f = $fn(x, n);
+                        let (a, b) = if tmp0 < tmp1 {
+                            (tmp0, tmp1)
+                        } else {
+                            (tmp1, tmp0)
+                        };
+
+                        let good = if a == b {
+                            // handles infinity equality
+                            true
+                        } else if a < $tolerance {
+                            b < $tolerance
+                        } else {
+                            let quo = b / a;
+                            (quo < (1. + $tolerance)) && (quo > (1. - $tolerance))
+                        };
+
+                        assert!(
+                            good,
+                            "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
+                            stringify!($fn), x, n, tmp0, tmp1
+                        );
+                    }
+                });
+            }
+        )*
+    };
+}
+
+pow! {
+    f32, 1e-4, __powisf2, all();
+    f64, 1e-12, __powidf2, all();
+}
+
+#[cfg(f128_enabled)]
+// FIXME(f16_f128): MSVC cannot build these until `__divtf3` is available in nightly.
+#[cfg(not(target_env = "msvc"))]
+#[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+pow! {
+    f128, 1e-36, __powitf2, not(feature = "no-sys-f128");
+}
+
+#[cfg(f128_enabled)]
+#[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+pow! {
+    f128, 1e-36, __powikf2, not(feature = "no-sys-f128");
+}
diff --git a/testcrate/tests/lse.rs b/testcrate/tests/lse.rs
index 5589f22f..cbecd614 100644
--- a/testcrate/tests/lse.rs
+++ b/testcrate/tests/lse.rs
@@ -1,5 +1,5 @@
 #![feature(decl_macro)] // so we can use pub(super)
-#![cfg(all(target_arch = "aarch64", not(feature = "no-asm")))]
+#![cfg(all(target_arch = "aarch64", target_os = "linux", not(feature = "no-asm")))]
 
 /// Translate a byte size to a Rust type.
 macro int_ty {
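The `good` predicate in `float_pow.rs` above is easier to read outside the macro. The following standalone restatement for `f64` shows the same accept rule: exact equality (which also covers matching infinities), both values below the tolerance, or a ratio within `1 ± tolerance`. The function name is illustrative only.

    /// Same shape as the `good` check in the `pow!` macro.
    fn approx_eq(tmp0: f64, tmp1: f64, tolerance: f64) -> bool {
        let (a, b) = if tmp0 < tmp1 { (tmp0, tmp1) } else { (tmp1, tmp0) };
        if a == b {
            true // handles infinity equality
        } else if a < tolerance {
            b < tolerance // both results are effectively zero
        } else {
            let quo = b / a;
            quo < 1.0 + tolerance && quo > 1.0 - tolerance
        }
    }

    fn main() {
        assert!(approx_eq(1.0, 1.0 + 1e-13, 1e-12)); // within relative tolerance
        assert!(!approx_eq(1.0, 2.0, 1e-12)); // ratio far outside tolerance
    }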
diff --git a/testcrate/tests/mem.rs b/testcrate/tests/mem.rs
index 5099d69e..59ae0bc5 100644
--- a/testcrate/tests/mem.rs
+++ b/testcrate/tests/mem.rs
@@ -37,6 +37,7 @@ fn memcpy_10() {
     }
 }
 
+#[cfg(not(target_os = "solana"))]
 #[test]
 fn memcpy_big() {
     // Make the arrays cross 3 pages
@@ -163,6 +164,7 @@ fn memmove_forward_misaligned_nonaligned_start() {
     }
 }
 
+#[cfg(not(target_os = "solana"))]
 #[test]
 fn memmove_forward_misaligned_aligned_start() {
     let mut arr = gen_arr::<32>();
diff --git a/testcrate/tests/misc.rs b/testcrate/tests/misc.rs
index 537ba1e6..f5ac2ab7 100644
--- a/testcrate/tests/misc.rs
+++ b/testcrate/tests/misc.rs
@@ -1,7 +1,6 @@
 // makes configuration easier
 #![allow(unused_macros)]
 
-use compiler_builtins::float::Float;
 use testcrate::*;
 
 /// Make sure that the edge case tester and randomized tester don't break, and list examples of
@@ -66,149 +65,145 @@ fn fuzz_values() {
 #[test]
 fn leading_zeros() {
-    use compiler_builtins::int::__clzsi2;
-    use compiler_builtins::int::leading_zeros::{
-        usize_leading_zeros_default, usize_leading_zeros_riscv,
-    };
-    fuzz(N, |x: usize| {
-        let lz = x.leading_zeros() as usize;
-        let lz0 = __clzsi2(x);
-        let lz1 = usize_leading_zeros_default(x);
-        let lz2 = usize_leading_zeros_riscv(x);
-        if lz0 != lz {
-            panic!("__clzsi2({}): std: {}, builtins: {}", x, lz, lz0);
-        }
-        if lz1 != lz {
-            panic!(
-                "usize_leading_zeros_default({}): std: {}, builtins: {}",
-                x, lz, lz1
-            );
-        }
-        if lz2 != lz {
-            panic!(
-                "usize_leading_zeros_riscv({}): std: {}, builtins: {}",
-                x, lz, lz2
-            );
-        }
-    })
-}
-
-macro_rules! extend {
-    ($fX:ident, $fD:ident, $fn:ident) => {
-        fuzz_float(N, |x: $fX| {
-            let tmp0 = x as $fD;
-            let tmp1: $fD = $fn(x);
-            if !Float::eq_repr(tmp0, tmp1) {
+    use compiler_builtins::int::leading_zeros::{leading_zeros_default, leading_zeros_riscv};
+    {
+        use compiler_builtins::int::leading_zeros::__clzsi2;
+        fuzz(N, |x: u32| {
+            if x == 0 {
+                return; // undefined value for an intrinsic
+            }
+            let lz = x.leading_zeros() as usize;
+            let lz0 = __clzsi2(x);
+            let lz1 = leading_zeros_default(x);
+            let lz2 = leading_zeros_riscv(x);
+            if lz0 != lz {
+                panic!("__clzsi2({}): std: {}, builtins: {}", x, lz, lz0);
+            }
+            if lz1 != lz {
                 panic!(
-                    "{}({}): std: {}, builtins: {}",
-                    stringify!($fn),
-                    x,
-                    tmp0,
-                    tmp1
+                    "leading_zeros_default({}): std: {}, builtins: {}",
+                    x, lz, lz1
                 );
             }
+            if lz2 != lz {
+                panic!("leading_zeros_riscv({}): std: {}, builtins: {}", x, lz, lz2);
+            }
         });
-    };
-}
-
-// PowerPC tests are failing on LLVM 13: https://github.com/rust-lang/rust/issues/88520
-#[cfg(not(target_arch = "powerpc64"))]
-#[test]
-fn float_extend() {
-    use compiler_builtins::float::extend::__extendsfdf2;
-
-    extend!(f32, f64, __extendsfdf2);
-}
-
-#[cfg(target_arch = "arm")]
-#[test]
-fn float_extend_arm() {
-    use compiler_builtins::float::extend::__extendsfdf2vfp;
-
-    extend!(f32, f64, __extendsfdf2vfp);
-}
-
-// This is approximate because of issues related to
-// https://github.com/rust-lang/rust/issues/73920.
-// TODO how do we resolve this indeterminacy?
-macro_rules! pow {
-    ($($f:ty, $tolerance:expr, $fn:ident);*;) => {
-        $(
-            fuzz_float_2(N, |x: $f, y: $f| {
-                if !(Float::is_subnormal(x) || Float::is_subnormal(y) || x.is_nan()) {
-                    let n = y.to_bits() & !<$f as Float>::SIGNIFICAND_MASK;
-                    let n = (n as <$f as Float>::SignedInt) >> <$f as Float>::SIGNIFICAND_BITS;
-                    let n = n as i32;
-                    let tmp0: $f = x.powi(n);
-                    let tmp1: $f = $fn(x, n);
-                    let (a, b) = if tmp0 < tmp1 {
-                        (tmp0, tmp1)
-                    } else {
-                        (tmp1, tmp0)
-                    };
-                    let good = {
-                        if a == b {
-                            // handles infinity equality
-                            true
-                        } else if a < $tolerance {
-                            b < $tolerance
-                        } else {
-                            let quo = b / a;
-                            (quo < (1. + $tolerance)) && (quo > (1. - $tolerance))
-                        }
-                    };
-                    if !good {
-                        panic!(
-                            "{}({}, {}): std: {}, builtins: {}",
-                            stringify!($fn), x, n, tmp0, tmp1
-                        );
-                    }
-                }
-            });
-        )*
-    };
-}
-
-#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
-#[test]
-fn float_pow() {
-    use compiler_builtins::float::pow::{__powidf2, __powisf2};
+    }
 
-    pow!(
-        f32, 1e-4, __powisf2;
-        f64, 1e-12, __powidf2;
-    );
-}
-
-macro_rules! trunc {
-    ($fX:ident, $fD:ident, $fn:ident) => {
-        fuzz_float(N, |x: $fX| {
-            let tmp0 = x as $fD;
-            let tmp1: $fD = $fn(x);
-            if !Float::eq_repr(tmp0, tmp1) {
+    {
+        use compiler_builtins::int::leading_zeros::__clzdi2;
+        fuzz(N, |x: u64| {
+            if x == 0 {
+                return; // undefined value for an intrinsic
+            }
+            let lz = x.leading_zeros() as usize;
+            let lz0 = __clzdi2(x);
+            let lz1 = leading_zeros_default(x);
+            let lz2 = leading_zeros_riscv(x);
+            if lz0 != lz {
+                panic!("__clzdi2({}): std: {}, builtins: {}", x, lz, lz0);
+            }
+            if lz1 != lz {
                 panic!(
-                    "{}({}): std: {}, builtins: {}",
-                    stringify!($fn),
-                    x,
-                    tmp0,
-                    tmp1
+                    "leading_zeros_default({}): std: {}, builtins: {}",
+                    x, lz, lz1
                 );
             }
+            if lz2 != lz {
+                panic!("leading_zeros_riscv({}): std: {}, builtins: {}", x, lz, lz2);
+            }
+        });
+    }
+
+    {
+        use compiler_builtins::int::leading_zeros::__clzti2;
+        fuzz(N, |x: u128| {
+            if x == 0 {
+                return; // undefined value for an intrinsic
+            }
+            let lz = x.leading_zeros() as usize;
+            let lz0 = __clzti2(x);
+            if lz0 != lz {
+                panic!("__clzti2({}): std: {}, builtins: {}", x, lz, lz0);
+            }
         });
-    };
+    }
 }
 
 #[test]
-fn float_trunc() {
-    use compiler_builtins::float::trunc::__truncdfsf2;
-
-    trunc!(f64, f32, __truncdfsf2);
+fn trailing_zeros() {
+    use compiler_builtins::int::trailing_zeros::{__ctzdi2, __ctzsi2, __ctzti2, trailing_zeros};
+    fuzz(N, |x: u32| {
+        if x == 0 {
+            return; // undefined value for an intrinsic
+        }
+        let tz = x.trailing_zeros() as usize;
+        let tz0 = __ctzsi2(x);
+        let tz1 = trailing_zeros(x);
+        if tz0 != tz {
+            panic!("__ctzsi2({}): std: {}, builtins: {}", x, tz, tz0);
+        }
+        if tz1 != tz {
+            panic!("trailing_zeros({}): std: {}, builtins: {}", x, tz, tz1);
+        }
+    });
+    fuzz(N, |x: u64| {
+        if x == 0 {
+            return; // undefined value for an intrinsic
+        }
+        let tz = x.trailing_zeros() as usize;
+        let tz0 = __ctzdi2(x);
+        let tz1 = trailing_zeros(x);
+        if tz0 != tz {
+            panic!("__ctzdi2({}): std: {}, builtins: {}", x, tz, tz0);
+        }
+        if tz1 != tz {
+            panic!("trailing_zeros({}): std: {}, builtins: {}", x, tz, tz1);
+        }
+    });
+    fuzz(N, |x: u128| {
+        if x == 0 {
+            return; // undefined value for an intrinsic
+        }
+        let tz = x.trailing_zeros() as usize;
+        let tz0 = __ctzti2(x);
+        if tz0 != tz {
+            panic!("__ctzti2({}): std: {}, builtins: {}", x, tz, tz0);
+        }
+    });
 }
 
-#[cfg(target_arch = "arm")]
 #[test]
-fn float_trunc_arm() {
-    use compiler_builtins::float::trunc::__truncdfsf2vfp;
+#[cfg(not(target_arch = "avr"))]
+fn bswap() {
+    use compiler_builtins::int::bswap::{__bswapdi2, __bswapsi2};
+    fuzz(N, |x: u32| {
+        assert_eq!(x.swap_bytes(), __bswapsi2(x));
+    });
+    fuzz(N, |x: u64| {
+        assert_eq!(x.swap_bytes(), __bswapdi2(x));
+    });
+
+    assert_eq!(__bswapsi2(0x12345678u32), 0x78563412u32);
+    assert_eq!(__bswapsi2(0x00000001u32), 0x01000000u32);
+    assert_eq!(__bswapdi2(0x123456789ABCDEF0u64), 0xF0DEBC9A78563412u64);
+    assert_eq!(__bswapdi2(0x0200000001000000u64), 0x0000000100000002u64);
+
+    #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
+    {
+        use compiler_builtins::int::bswap::__bswapti2;
+        fuzz(N, |x: u128| {
+            assert_eq!(x.swap_bytes(), __bswapti2(x));
+        });
-    trunc!(f64, f32, __truncdfsf2vfp);
+        assert_eq!(
+            __bswapti2(0x123456789ABCDEF013579BDF02468ACEu128),
+            0xCE8A4602DF9B5713F0DEBC9A78563412u128
+        );
+        assert_eq!(
+            __bswapti2(0x04000000030000000200000001000000u128),
+            0x00000001000000020000000300000004u128
+        );
+    }
 }
diff --git a/testcrate/tests/mul.rs b/testcrate/tests/mul.rs
index 819f06ca..449d1948 100644
--- a/testcrate/tests/mul.rs
+++ b/testcrate/tests/mul.rs
@@ -1,130 +1,155 @@
 #![allow(unused_macros)]
+#![cfg_attr(f128_enabled, feature(f128))]
 
 use testcrate::*;
 
-macro_rules! mul {
-    ($($i:ty, $fn:ident);*;) => {
-        $(
-            fuzz_2(N, |x: $i, y: $i| {
-                let mul0 = x.wrapping_mul(y);
-                let mul1: $i = $fn(x, y);
-                if mul0 != mul1 {
-                    panic!(
-                        "{}({}, {}): std: {}, builtins: {}",
-                        stringify!($fn), x, y, mul0, mul1
-                    );
-                }
-            });
-        )*
-    };
-}
+mod int_mul {
+    use super::*;
+
+    macro_rules! mul {
+        ($($i:ty, $fn:ident);*;) => {
+            $(
+                #[test]
+                fn $fn() {
+                    use compiler_builtins::int::mul::$fn;
 
-#[test]
-fn mul() {
-    use compiler_builtins::int::mul::{__muldi3, __multi3};
+                    fuzz_2(N, |x: $i, y: $i| {
+                        let mul0 = x.wrapping_mul(y);
+                        let mul1: $i = $fn(x, y);
+                        if mul0 != mul1 {
+                            panic!(
+                                "{}({}, {}): std: {}, builtins: {}",
+                                stringify!($fn), x, y, mul0, mul1
+                            );
+                        }
+                    });
-    mul!(
+                }
+            )*
+        };
+    }
+
+    mul! {
         u64, __muldi3;
         i128, __multi3;
-    );
+    }
 }
 
-macro_rules! overflowing_mul {
-    ($($i:ty, $fn:ident);*;) => {
-        $(
-            fuzz_2(N, |x: $i, y: $i| {
-                let (mul0, o0) = x.overflowing_mul(y);
-                let mut o1 = 0i32;
-                let mul1: $i = $fn(x, y, &mut o1);
-                let o1 = o1 != 0;
-                if mul0 != mul1 || o0 != o1 {
-                    panic!(
-                        "{}({}, {}): std: ({}, {}), builtins: ({}, {})",
-                        stringify!($fn), x, y, mul0, o0, mul1, o1
-                    );
-                }
-            });
-        )*
-    };
-}
+mod int_overflowing_mul {
+    use super::*;
 
-#[test]
-fn overflowing_mul() {
-    use compiler_builtins::int::mul::{
-        __mulodi4, __mulosi4, __muloti4, __rust_i128_mulo, __rust_u128_mulo,
-    };
+    macro_rules! overflowing_mul {
+        ($($i:ty, $fn:ident);*;) => {
+            $(
+                #[test]
+                fn $fn() {
+                    use compiler_builtins::int::mul::$fn;
+
+                    fuzz_2(N, |x: $i, y: $i| {
+                        let (mul0, o0) = x.overflowing_mul(y);
+                        let mut o1 = 0i32;
+                        let mul1: $i = $fn(x, y, &mut o1);
+                        let o1 = o1 != 0;
+                        if mul0 != mul1 || o0 != o1 {
+                            panic!(
+                                "{}({}, {}): std: ({}, {}), builtins: ({}, {})",
+                                stringify!($fn), x, y, mul0, o0, mul1, o1
+                            );
+                        }
+                    });
+                }
+            )*
+        };
+    }
 
-    overflowing_mul!(
+    overflowing_mul! {
        i32, __mulosi4;
        i64, __mulodi4;
        i128, __muloti4;
-    );
-    fuzz_2(N, |x: u128, y: u128| {
-        let (mul0, o0) = x.overflowing_mul(y);
-        let (mul1, o1) = __rust_u128_mulo(x, y);
-        if mul0 != mul1 || o0 != o1 {
-            panic!(
-                "__rust_u128_mulo({}, {}): std: ({}, {}), builtins: ({}, {})",
-                x, y, mul0, o0, mul1, o1
-            );
-        }
-        let x = x as i128;
-        let y = y as i128;
-        let (mul0, o0) = x.overflowing_mul(y);
-        let (mul1, o1) = __rust_i128_mulo(x, y);
-        if mul0 != mul1 || o0 != o1 {
-            panic!(
-                "__rust_i128_mulo({}, {}): std: ({}, {}), builtins: ({}, {})",
-                x, y, mul0, o0, mul1, o1
-            );
-        }
-    });
+    }
+
+    #[test]
+    fn overflowing_mul_u128() {
+        use compiler_builtins::int::mul::{__rust_i128_mulo, __rust_u128_mulo};
+
+        fuzz_2(N, |x: u128, y: u128| {
+            let (mul0, o0) = x.overflowing_mul(y);
+            let (mul1, o1) = __rust_u128_mulo(x, y);
+            if mul0 != mul1 || o0 != o1 {
+                panic!(
+                    "__rust_u128_mulo({}, {}): std: ({}, {}), builtins: ({}, {})",
+                    x, y, mul0, o0, mul1, o1
+                );
+            }
+            let x = x as i128;
+            let y = y as i128;
+            let (mul0, o0) = x.overflowing_mul(y);
+            let (mul1, o1) = __rust_i128_mulo(x, y);
+            if mul0 != mul1 || o0 != o1 {
+                panic!(
+                    "__rust_i128_mulo({}, {}): std: ({}, {}), builtins: ({}, {})",
+                    x, y, mul0, o0, mul1, o1
+                );
+            }
+        });
+    }
 }
 
 macro_rules! float_mul {
-    ($($f:ty, $fn:ident);*;) => {
+    ($($f:ty, $fn:ident, $apfloat_ty:ident, $sys_available:meta);*;) => {
         $(
-            fuzz_float_2(N, |x: $f, y: $f| {
-                let mul0 = x * y;
-                let mul1: $f = $fn(x, y);
-                // multiplication of subnormals is not currently handled
-                if !(Float::is_subnormal(mul0) || Float::is_subnormal(mul1)) {
+            #[test]
+            fn $fn() {
+                use compiler_builtins::float::{mul::$fn, Float};
+                use core::ops::Mul;
+
+                fuzz_float_2(N, |x: $f, y: $f| {
+                    let mul0 = apfloat_fallback!($f, $apfloat_ty, $sys_available, Mul::mul, x, y);
+                    let mul1: $f = $fn(x, y);
                     if !Float::eq_repr(mul0, mul1) {
                         panic!(
-                            "{}({}, {}): std: {}, builtins: {}",
+                            "{}({:?}, {:?}): std: {:?}, builtins: {:?}",
                             stringify!($fn), x, y, mul0, mul1
                         );
                     }
-                }
-            });
+                });
+            }
         )*
     };
 }
 
 #[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
-#[test]
-fn float_mul() {
-    use compiler_builtins::float::{
-        mul::{__muldf3, __mulsf3},
-        Float,
-    };
+mod float_mul {
+    use super::*;
 
-    float_mul!(
-        f32, __mulsf3;
-        f64, __muldf3;
-    );
+    // FIXME(#616): Stop ignoring arches that don't have native support once fix for builtins is in
+    // nightly.
+    float_mul! {
+        f32, __mulsf3, Single, not(target_arch = "arm");
+        f64, __muldf3, Double, not(target_arch = "arm");
+    }
 }
 
-#[cfg(target_arch = "arm")]
-#[test]
-fn float_mul_arm() {
-    use compiler_builtins::float::{
-        mul::{__muldf3vfp, __mulsf3vfp},
-        Float,
-    };
+#[cfg(f128_enabled)]
+#[cfg(not(all(target_arch = "x86", not(target_feature = "sse"))))]
+#[cfg(not(any(target_arch = "powerpc", target_arch = "powerpc64")))]
+mod float_mul_f128 {
+    use super::*;
+
+    float_mul! {
+        f128, __multf3, Quad,
+        // FIXME(llvm): there is a bug in LLVM rt.
+        // See .
+        not(any(feature = "no-sys-f128", all(target_arch = "aarch64", target_os = "linux")));
+    }
+}
+
+#[cfg(f128_enabled)]
+#[cfg(any(target_arch = "powerpc", target_arch = "powerpc64"))]
+mod float_mul_f128_ppc {
+    use super::*;
 
-    float_mul!(
-        f32, __mulsf3vfp;
-        f64, __muldf3vfp;
-    );
+    float_mul! {
+        f128, __mulkf3, Quad, not(feature = "no-sys-f128");
+    }
 }
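All of the `overflowing_mul!` instances above check the same contract; restated as a self-contained sketch for `u64`, with a closure standing in for the real `__mulodi4`:

    /// Property checked by the `overflowing_mul!` tests: the builtin must
    /// return the wrapped product and report overflow through its out-pointer
    /// exactly like the standard library's `overflowing_mul`.
    fn check_mulo_u64(x: u64, y: u64, builtin: impl Fn(u64, u64, &mut i32) -> u64) {
        let (expected, expected_overflow) = x.overflowing_mul(y);
        let mut flag = 0i32;
        let actual = builtin(x, y, &mut flag);
        assert_eq!((expected, expected_overflow), (actual, flag != 0));
    }

    fn main() {
        // Stand-in "builtin" implemented with std for illustration.
        let fake = |x: u64, y: u64, o: &mut i32| {
            let (m, ov) = x.overflowing_mul(y);
            *o = ov as i32;
            m
        };
        check_mulo_u64(u64::MAX, 2, fake); // overflowing case
        check_mulo_u64(3, 4, fake); // non-overflowing case
    }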
diff --git a/testcrate/tests/shift.rs b/testcrate/tests/shift.rs
index 7a76b164..23e3395e 100644
--- a/testcrate/tests/shift.rs
+++ b/testcrate/tests/shift.rs
@@ -3,35 +3,33 @@
 use testcrate::*;
 
 macro_rules! shift {
     ($($i:ty, $fn_std:ident, $fn_builtins:ident);*;) => {
         $(
-            fuzz_shift(|x: $i, s: u32| {
-                let tmp0: $i = x.$fn_std(s);
-                let tmp1: $i = $fn_builtins(x, s);
-                if tmp0 != tmp1 {
-                    panic!(
-                        "{}({}, {}): std: {}, builtins: {}",
-                        stringify!($fn_builtins), x, s, tmp0, tmp1
-                    );
-                }
-            });
+            #[test]
+            fn $fn_builtins() {
+                use compiler_builtins::int::shift::$fn_builtins;
+
+                fuzz_shift(|x: $i, s: u32| {
+                    let tmp0: $i = x.$fn_std(s);
+                    let tmp1: $i = $fn_builtins(x, s);
+                    if tmp0 != tmp1 {
+                        panic!(
+                            "{}({}, {}): std: {}, builtins: {}",
+                            stringify!($fn_builtins), x, s, tmp0, tmp1
+                        );
+                    }
+                });
+            }
         )*
     };
 }
 
-#[test]
-fn shift() {
-    use compiler_builtins::int::shift::{
-        __ashldi3, __ashlsi3, __ashlti3, __ashrdi3, __ashrsi3, __ashrti3, __lshrdi3, __lshrsi3,
-        __lshrti3,
-    };
-    shift!(
-        u32, wrapping_shl, __ashlsi3;
-        u64, wrapping_shl, __ashldi3;
-        u128, wrapping_shl, __ashlti3;
-        i32, wrapping_shr, __ashrsi3;
-        i64, wrapping_shr, __ashrdi3;
-        i128, wrapping_shr, __ashrti3;
-        u32, wrapping_shr, __lshrsi3;
-        u64, wrapping_shr, __lshrdi3;
-        u128, wrapping_shr, __lshrti3;
-    );
+shift! {
+    u32, wrapping_shl, __ashlsi3;
+    u64, wrapping_shl, __ashldi3;
+    u128, wrapping_shl, __ashlti3;
+    i32, wrapping_shr, __ashrsi3;
+    i64, wrapping_shr, __ashrdi3;
+    i128, wrapping_shr, __ashrti3;
+    u32, wrapping_shr, __lshrsi3;
+    u64, wrapping_shr, __lshrdi3;
+    u128, wrapping_shr, __lshrti3;
 }
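For reference, each entry in the `shift!` invocation expands to a test of roughly the following shape; this hand-expanded sketch for the `u32`/`__ashlsi3` case replaces testcrate's `fuzz_shift` with a small exhaustive loop, and the reference function is a stand-in for the real builtin.

    // Hand-expanded version of one `shift!` entry: wrapping_shl on u32 must
    // match the __ashlsi3 builtin for every in-range shift amount (s < 32).
    fn ashlsi3_reference(x: u32, s: u32) -> u32 {
        x.wrapping_shl(s) // agrees with the builtin for s < 32
    }

    fn main() {
        for s in 0..32 {
            for &x in &[0u32, 1, 0x8000_0000, u32::MAX] {
                let tmp0 = x.wrapping_shl(s);
                let tmp1 = ashlsi3_reference(x, s);
                assert_eq!(tmp0, tmp1, "__ashlsi3({x}, {s}) mismatch");
            }
        }
    }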