From 78a63efce67173fcc4cba4827cc94b4186c77a54 Mon Sep 17 00:00:00 2001
From: Ivo Gabe de Wolff
Date: Fri, 17 Nov 2023 23:44:43 +0100
Subject: [PATCH 1/2] Align shared memory in fold & scan (only shuffle)

Previously, shared memory was allocated without any padding. As a result,
reads and stores could be misaligned, for instance when scanning an array
containing (Bool, Int).

This commit fixes this issue only for folds and scans that use shuffle
instructions. Fixing it for folds and scans on older hardware is possible,
but probably not worth it given the age of that hardware and the
complexity of the fix.
---
 .../Array/Accelerate/LLVM/PTX/CodeGen/Base.hs | 39 +++++++++-
 .../Array/Accelerate/LLVM/PTX/CodeGen/Fold.hs | 20 ++---
 .../Accelerate/LLVM/PTX/CodeGen/FoldSeg.hs    | 10 +--
 .../Array/Accelerate/LLVM/PTX/CodeGen/Scan.hs | 73 +++++--------------
 4 files changed, 67 insertions(+), 75 deletions(-)

diff --git a/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/Base.hs b/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/Base.hs
index a92e3d14d..8855c850a 100644
--- a/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/Base.hs
+++ b/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/Base.hs
@@ -43,6 +43,7 @@ module Data.Array.Accelerate.LLVM.PTX.CodeGen.Base (

   -- Shared memory
   staticSharedMem,
+  sharedMemorySizeAdd,
   dynamicSharedMem,
   sharedMemAddrSpace, sharedMemVolatility,

@@ -676,6 +677,24 @@ initialiseDynamicSharedMemory = do
     }
   return $ ConstantOperand $ GlobalReference (PrimType (PtrPrimType (ArrayPrimType 0 scalarType) sharedMemAddrSpace)) "__shared__"

+sharedMemorySizeAdd
+    :: TypeR e
+    -> Int -- number of array elements
+    -> Int -- #bytes of shared memory that have already been allocated
+    -> Int
+sharedMemorySizeAdd tp n i = case tp of
+  TupRunit -> i
+  TupRpair t2 t1 ->
+    -- First handle the second element of the tuple, then the first,
+    -- to match the behaviour of dynamicSharedMem
+    sharedMemorySizeAdd t2 n $ sharedMemorySizeAdd t1 n i
+  TupRsingle t ->
+    let
+      bytes = bytesElt tp
+      -- Align 'i' to the alignment of t
+      aligned = alignTo (scalarAlignment t) i
+    in
+      aligned + bytes * n

 -- Declared a new dynamically allocated array in the __shared__ memory space
 -- with enough space to contain the given number of elements.
@@ -700,14 +719,18 @@ dynamicSharedMem tp int n@(op int -> m) (op int -> offset)
       (i2, p2) <- go t2 i1
       return $ (i2, OP_Pair p2 p1)
     go (TupRsingle t) i = do
+      let bytes = bytesElt (TupRsingle t)
+      let align = scalarAlignment t
+      i' <- instr' $ Add numTp i (A.integral int $ P.fromIntegral $ align - 1)
+      aligned <- instr' $ BAnd int i' (A.integral int $ P.fromIntegral $ Data.Bits.complement $ align - 1)
 #if MIN_VERSION_llvm_hs(15,0,0)
-      p <- instr' $ GetElementPtr scalarType smem [i]
+      p <- instr' $ GetElementPtr scalarType smem [aligned]
 #else
-      p <- instr' $ GetElementPtr scalarType smem [A.num numTp 0, i] -- TLM: note initial zero index!!
+      p <- instr' $ GetElementPtr scalarType smem [A.num numTp 0, aligned] -- TLM: note initial zero index!!
#endif q <- instr' $ PtrCast (PtrPrimType (ScalarPrimType t) sharedMemAddrSpace) p - a <- instr' $ Mul numTp m (A.integral int (P.fromIntegral (bytesElt (TupRsingle t)))) - b <- instr' $ Add numTp i a + a <- instr' $ Mul numTp m (A.integral int (P.fromIntegral bytes)) + b <- instr' $ Add numTp aligned a return (b, ir t (unPtr q)) -- (_, ad) <- go tp offset @@ -803,3 +826,11 @@ makeKernel config name@(Label l) param kernel = do } } +scalarAlignment :: ScalarType t -> Int +scalarAlignment t@(SingleScalarType _) = bytesElt (TupRsingle t) +scalarAlignment (VectorScalarType (VectorType _ t)) = bytesElt (TupRsingle $ SingleScalarType t) + +-- Align 'ptr' to the given alignment. +-- Assumes 'align' is a power of 2. +alignTo :: Int -> Int -> Int +alignTo align ptr = (ptr + align - 1) .&. Data.Bits.complement (align - 1) diff --git a/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/Fold.hs b/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/Fold.hs index 456249303..25064d016 100644 --- a/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/Fold.hs +++ b/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/Fold.hs @@ -134,8 +134,8 @@ mkFoldAllS uid dev aenv tp combine mseed marr = -- config = launchConfig dev (CUDA.incWarp dev) smem multipleOf multipleOfQ smem n - | canShfl dev = warps * bytes - | otherwise = warps * (1 + per_warp) * bytes + | canShfl dev = sharedMemorySizeAdd tp warps 0 + | otherwise = warps * (1 + per_warp) * bytes -- This does not take alignment into account where ws = CUDA.warpSize dev warps = n `P.quot` ws @@ -195,8 +195,8 @@ mkFoldAllM1 uid dev aenv tp combine marr = -- config = launchConfig dev (CUDA.incWarp dev) smem const [|| const ||] smem n - | canShfl dev = warps * bytes - | otherwise = warps * (1 + per_warp) * bytes + | canShfl dev = sharedMemorySizeAdd tp warps 0 + | otherwise = warps * (1 + per_warp) * bytes -- This does not take alignment into account where ws = CUDA.warpSize dev warps = n `P.quot` ws @@ -254,8 +254,8 @@ mkFoldAllM2 uid dev aenv tp combine mseed = -- config = launchConfig dev (CUDA.incWarp dev) smem const [|| const ||] smem n - | canShfl dev = warps * bytes - | otherwise = warps * (1 + per_warp) * bytes + | canShfl dev = sharedMemorySizeAdd tp warps 0 + | otherwise = warps * (1 + per_warp) * bytes -- This does not take alignment into account where ws = CUDA.warpSize dev warps = n `P.quot` ws @@ -325,8 +325,8 @@ mkFoldDim uid aenv repr@(ArrayR shr tp) combine mseed marr = do -- config = launchConfig dev (CUDA.incWarp dev) smem const [|| const ||] smem n - | canShfl dev = warps * bytes - | otherwise = warps * (1 + per_warp) * bytes + | canShfl dev = sharedMemorySizeAdd tp warps 0 + | otherwise = warps * (1 + per_warp) * bytes -- This does not take alignment into account where ws = CUDA.warpSize dev warps = n `P.quot` ws @@ -479,7 +479,7 @@ reduceBlockSMem dev tp combine size = warpReduce >=> warpAggregate warpReduce input = do -- Allocate (1.5 * warpSize) elements of shared memory for each warp wid <- warpId - skip <- A.mul numType wid (int32 (warp_smem_elems * bytes)) + skip <- A.mul numType wid (int32 (warp_smem_elems * bytes)) -- This does not take alignment into account smem <- dynamicSharedMem tp TypeInt32 (int32 warp_smem_elems) skip -- Are we doing bounds checking for this warp? 
@@ -505,7 +505,7 @@ reduceBlockSMem dev tp combine size = warpReduce >=> warpAggregate -- Allocate #warps elements of shared memory bd <- blockDim warps <- A.quot integralType bd (int32 (CUDA.warpSize dev)) - skip <- A.mul numType warps (int32 (warp_smem_elems * bytes)) + skip <- A.mul numType warps (int32 (warp_smem_elems * bytes)) -- This does not take alignment into account smem <- dynamicSharedMem tp TypeInt32 warps skip -- Share the per-lane aggregates diff --git a/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/FoldSeg.hs b/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/FoldSeg.hs index 5e5e0e462..9dfc6e11f 100644 --- a/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/FoldSeg.hs +++ b/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/FoldSeg.hs @@ -100,8 +100,8 @@ mkFoldSegP_block uid aenv repr@(ArrayR shr tp) intTp combine mseed marr mseg = d -- config = launchConfig dev (CUDA.decWarp dev) dsmem const [|| const ||] dsmem n - | canShfl dev = warps * bytes - | otherwise = warps * (1 + per_warp) * bytes + | canShfl dev = sharedMemorySizeAdd tp warps 0 + | otherwise = warps * (1 + per_warp) * bytes -- This does not take alignment into account where ws = CUDA.warpSize dev warps = n `P.quot` ws @@ -307,7 +307,7 @@ mkFoldSegP_warp uid aenv repr@(ArrayR shr tp) intTp combine mseed marr mseg = do -- per_warp_bytes | canShfl dev = 0 - | otherwise = (2 * sizeOf (undefined::Int)) `P.max` (bytesElt tp * per_warp_elems) + | otherwise = (2 * sizeOf (undefined::Int)) `P.max` (bytesElt tp * per_warp_elems) -- This does not take alignment into account per_warp_elems = ws + (ws `P.quot` 2) ws = CUDA.warpSize dev @@ -358,7 +358,7 @@ mkFoldSegP_warp uid aenv repr@(ArrayR shr tp) intTp combine mseed marr mseg = do lim <- case canShfl dev of True -> return (unsmem (TupRsingle scalarTypeInt)) False -> do - a <- A.mul numType wid (int32 per_warp_bytes) + a <- A.mul numType wid (int32 per_warp_bytes) -- This does not take alignment into account b <- dynamicSharedMem (TupRsingle scalarTypeInt) TypeInt32 (liftInt32 0) a return b @@ -371,7 +371,7 @@ mkFoldSegP_warp uid aenv repr@(ArrayR shr tp) intTp combine mseed marr mseg = do smem <- case canShfl dev of True -> return (unsmem tp) False -> do - a <- A.mul numType wid (int32 per_warp_bytes) + a <- A.mul numType wid (int32 per_warp_bytes) -- This does not take alignment into account b <- dynamicSharedMem tp TypeInt32 (int32 per_warp_elems) a return b diff --git a/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/Scan.hs b/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/Scan.hs index 252e4f793..618257677 100644 --- a/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/Scan.hs +++ b/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/Scan.hs @@ -159,15 +159,7 @@ mkScanAllP1 dir uid aenv tp combine mseed marr = do end = indexHead (irArrayShape arrTmp) paramEnv = envParam aenv -- - config = launchConfig dev (CUDA.incWarp dev) smem const [|| const ||] - smem n - | canShfl dev = warps * bytes - | otherwise = warps * (1 + per_warp) * bytes - where - ws = CUDA.warpSize dev - warps = n `P.quot` ws - per_warp = ws + ws `P.quot` 2 - bytes = bytesElt tp + config = launchConfig dev (CUDA.incWarp dev) (scanSMemSize dev tp) const [|| const ||] -- makeOpenAccWith config uid "scanP1" (paramTmp ++ paramOut ++ paramIn ++ paramEnv) $ do @@ -279,17 +271,9 @@ mkScanAllP2 dir uid aenv tp combine = do start = liftInt 0 end = indexHead (irArrayShape arrTmp) -- - config = 
launchConfig dev (CUDA.incWarp dev) smem grid gridQ + config = launchConfig dev (CUDA.incWarp dev) (scanSMemSize dev tp) grid gridQ grid _ _ = 1 gridQ = [|| \_ _ -> 1 ||] - smem n - | canShfl dev = warps * bytes - | otherwise = warps * (1 + per_warp) * bytes - where - ws = CUDA.warpSize dev - warps = n `P.quot` ws - per_warp = ws + ws `P.quot` 2 - bytes = bytesElt tp -- makeOpenAccWith config uid "scanP2" (paramTmp ++ paramEnv) $ do @@ -476,15 +460,7 @@ mkScan'AllP1 dir uid aenv tp combine seed marr = do end = indexHead (irArrayShape arrTmp) paramEnv = envParam aenv -- - config = launchConfig dev (CUDA.incWarp dev) smem const [|| const ||] - smem n - | canShfl dev = warps * bytes - | otherwise = warps * (1 + per_warp) * bytes - where - ws = CUDA.warpSize dev - warps = n `P.quot` ws - per_warp = ws + ws `P.quot` 2 - bytes = bytesElt tp + config = launchConfig dev (CUDA.incWarp dev) (scanSMemSize dev tp) const [|| const ||] -- makeOpenAccWith config uid "scanP1" (paramTmp ++ paramOut ++ paramIn ++ paramEnv) $ do @@ -590,17 +566,9 @@ mkScan'AllP2 dir uid aenv tp combine = do start = liftInt 0 end = indexHead (irArrayShape arrTmp) -- - config = launchConfig dev (CUDA.incWarp dev) smem grid gridQ + config = launchConfig dev (CUDA.incWarp dev) (scanSMemSize dev tp) grid gridQ grid _ _ = 1 gridQ = [|| \_ _ -> 1 ||] - smem n - | canShfl dev = warps * bytes - | otherwise = warps * (1 + per_warp) * bytes - where - ws = CUDA.warpSize dev - warps = n `P.quot` ws - per_warp = ws + ws `P.quot` 2 - bytes = bytesElt tp -- makeOpenAccWith config uid "scanP2" (paramTmp ++ paramSum ++ paramEnv) $ do @@ -786,15 +754,7 @@ mkScanDim dir uid aenv repr@(ArrayR (ShapeRsnoc shr) tp) combine mseed marr = do (arrIn, paramIn) = delayedArray "in" marr paramEnv = envParam aenv -- - config = launchConfig dev (CUDA.incWarp dev) smem const [|| const ||] - smem n - | canShfl dev = warps * bytes - | otherwise = warps * (1 + per_warp) * bytes - where - ws = CUDA.warpSize dev - warps = n `P.quot` ws - per_warp = ws + ws `P.quot` 2 - bytes = bytesElt tp + config = launchConfig dev (CUDA.incWarp dev) (scanSMemSize dev tp) const [|| const ||] -- makeOpenAccWith config uid "scan" (paramOut ++ paramIn ++ paramEnv) $ do @@ -987,15 +947,7 @@ mkScan'Dim dir uid aenv repr@(ArrayR (ShapeRsnoc shr) tp) combine seed marr = do (arrIn, paramIn) = delayedArray "in" marr paramEnv = envParam aenv -- - config = launchConfig dev (CUDA.incWarp dev) smem const [|| const ||] - smem n - | canShfl dev = warps * bytes - | otherwise = warps * (1 + per_warp) * bytes - where - ws = CUDA.warpSize dev - warps = n `P.quot` ws - per_warp = ws + ws `P.quot` 2 - bytes = bytesElt tp + config = launchConfig dev (CUDA.incWarp dev) (scanSMemSize dev tp) const [|| const ||] -- makeOpenAccWith config uid "scan" (paramOut ++ paramSum ++ paramIn ++ paramEnv) $ do @@ -1181,6 +1133,15 @@ mkScan'Fill mkScan'Fill uid aenv repr seed = Safe.coerce <$> mkGenerate uid aenv (reduceRank repr) (IRFun1 (const seed)) +scanSMemSize :: DeviceProperties -> TypeR e -> Int -> Int +scanSMemSize dev tp n + | canShfl dev = sharedMemorySizeAdd tp warps 0 + | otherwise = warps * (1 + per_warp) * bytes -- This does not take alignment into account + where + ws = CUDA.warpSize dev + warps = n `P.quot` ws + per_warp = ws + ws `P.quot` 2 + bytes = bytesElt tp -- Block wide scan -- --------------- @@ -1240,7 +1201,7 @@ scanBlockSMem dir dev tp combine nelem = warpScan >=> warpPrefix -- Allocate (1.5 * warpSize) elements of shared memory for each warp -- (individually addressable by each warp) wid 
<- warpId - skip <- A.mul numType wid (int32 warp_smem_bytes) + skip <- A.mul numType wid (int32 warp_smem_bytes) -- This does not take alignment into account smem <- dynamicSharedMem tp TypeInt32 (int32 warp_smem_elems) skip scanWarpSMem dir dev tp combine smem input @@ -1252,7 +1213,7 @@ scanBlockSMem dir dev tp combine nelem = warpScan >=> warpPrefix -- Allocate #warps elements of shared memory bd <- blockDim warps <- A.quot integralType bd (int32 (CUDA.warpSize dev)) - skip <- A.mul numType warps (int32 warp_smem_bytes) + skip <- A.mul numType warps (int32 warp_smem_bytes) -- This does not take alignment into account smem <- dynamicSharedMem tp TypeInt32 warps skip -- Share warp aggregates From 3a89ce4336100d8abe61289a3e95d41d6e4e99f2 Mon Sep 17 00:00:00 2001 From: Tom Smeding Date: Sun, 15 Dec 2024 19:58:09 +0100 Subject: [PATCH 2/2] Drop support for compute capability < 3.0 --- .../Array/Accelerate/LLVM/PTX/CodeGen/Base.hs | 6 - .../Array/Accelerate/LLVM/PTX/CodeGen/Fold.hs | 204 ++---------------- .../Accelerate/LLVM/PTX/CodeGen/FoldSeg.hs | 96 ++------- .../Array/Accelerate/LLVM/PTX/CodeGen/Scan.hs | 170 +-------------- .../Data/Array/Accelerate/LLVM/PTX/Compile.hs | 12 ++ 5 files changed, 58 insertions(+), 430 deletions(-) diff --git a/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/Base.hs b/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/Base.hs index 8855c850a..7ad3decfd 100644 --- a/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/Base.hs +++ b/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/Base.hs @@ -39,7 +39,6 @@ module Data.Array.Accelerate.LLVM.PTX.CodeGen.Base ( -- Warp shuffle instructions __shfl_up, __shfl_down, __shfl_idx, __broadcast, - canShfl, -- Shared memory staticSharedMem, @@ -382,11 +381,6 @@ __shfl_idx = shfl Idx __broadcast :: TypeR a -> Operands a -> CodeGen PTX (Operands a) __broadcast aR a = __shfl_idx aR a (liftWord32 0) --- Warp shuffle instructions are available for compute capability >= 3.0 --- -canShfl :: DeviceProperties -> Bool -canShfl dev = CUDA.computeCapability dev >= Compute 3 0 - shfl :: ShuffleOp -> TypeR a diff --git a/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/Fold.hs b/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/Fold.hs index 25064d016..abbca01ba 100644 --- a/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/Fold.hs +++ b/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/Fold.hs @@ -21,7 +21,6 @@ module Data.Array.Accelerate.LLVM.PTX.CodeGen.Fold where import Data.Array.Accelerate.Representation.Array -import Data.Array.Accelerate.Representation.Elt import Data.Array.Accelerate.Representation.Shape hiding ( size ) import Data.Array.Accelerate.Representation.Type @@ -133,14 +132,10 @@ mkFoldAllS uid dev aenv tp combine mseed marr = paramEnv = envParam aenv -- config = launchConfig dev (CUDA.incWarp dev) smem multipleOf multipleOfQ - smem n - | canShfl dev = sharedMemorySizeAdd tp warps 0 - | otherwise = warps * (1 + per_warp) * bytes -- This does not take alignment into account + smem n = sharedMemorySizeAdd tp warps 0 where ws = CUDA.warpSize dev warps = n `P.quot` ws - per_warp = ws + ws `P.quot` 2 - bytes = bytesElt tp in makeOpenAccWith config uid "foldAllS" (paramOut ++ paramIn ++ paramEnv) $ do @@ -194,14 +189,10 @@ mkFoldAllM1 uid dev aenv tp combine marr = start = liftInt 0 -- config = launchConfig dev (CUDA.incWarp dev) smem const [|| const ||] - smem n - | canShfl dev = sharedMemorySizeAdd tp warps 0 - | 
otherwise = warps * (1 + per_warp) * bytes -- This does not take alignment into account + smem n = sharedMemorySizeAdd tp warps 0 where ws = CUDA.warpSize dev warps = n `P.quot` ws - per_warp = ws + ws `P.quot` 2 - bytes = bytesElt tp in makeOpenAccWith config uid "foldAllM1" (paramTmp ++ paramIn ++ paramEnv) $ do @@ -253,14 +244,10 @@ mkFoldAllM2 uid dev aenv tp combine mseed = start = liftInt 0 -- config = launchConfig dev (CUDA.incWarp dev) smem const [|| const ||] - smem n - | canShfl dev = sharedMemorySizeAdd tp warps 0 - | otherwise = warps * (1 + per_warp) * bytes -- This does not take alignment into account + smem n = sharedMemorySizeAdd tp warps 0 where ws = CUDA.warpSize dev warps = n `P.quot` ws - per_warp = ws + ws `P.quot` 2 - bytes = bytesElt tp in makeOpenAccWith config uid "foldAllM2" (paramTmp ++ paramOut ++ paramEnv) $ do @@ -324,14 +311,10 @@ mkFoldDim uid aenv repr@(ArrayR shr tp) combine mseed marr = do paramEnv = envParam aenv -- config = launchConfig dev (CUDA.incWarp dev) smem const [|| const ||] - smem n - | canShfl dev = sharedMemorySizeAdd tp warps 0 - | otherwise = warps * (1 + per_warp) * bytes -- This does not take alignment into account + smem n = sharedMemorySizeAdd tp warps 0 where ws = CUDA.warpSize dev warps = n `P.quot` ws - per_warp = ws + ws `P.quot` 2 - bytes = bytesElt tp -- makeOpenAccWith config uid "fold" (paramOut ++ paramIn ++ paramEnv) $ do @@ -435,19 +418,6 @@ mkFoldFill uid aenv repr seed = mkGenerate uid aenv repr (IRFun1 (const seed)) -reduceBlock - :: forall aenv e. - DeviceProperties -- ^ properties of the target device - -> TypeR e - -> IRFun2 PTX aenv (e -> e -> e) -- ^ combination function - -> Maybe (Operands Int32) -- ^ number of valid elements (may be less than block size) - -> Operands e -- ^ calling thread's input element - -> CodeGen PTX (Operands e) -- ^ thread-block-wide reduction using the specified operator (lane 0 only) -reduceBlock dev - | canShfl dev = reduceBlockShfl dev - | otherwise = reduceBlockSMem dev - - -- Efficient threadblock-wide reduction using the specified operator. The -- aggregate reduction value is stored in thread zero. Supports non-commutative -- operators. @@ -456,7 +426,7 @@ reduceBlock dev -- -- Example: https://github.com/NVlabs/cub/blob/1.5.2/cub/block/specializations/block_reduce_warp_reductions.cuh -- -reduceBlockSMem +reduceBlock :: forall aenv e. DeviceProperties -- ^ properties of the target device -> TypeR e @@ -464,146 +434,7 @@ reduceBlockSMem -> Maybe (Operands Int32) -- ^ number of valid elements (may be less than block size) -> Operands e -- ^ calling thread's input element -> CodeGen PTX (Operands e) -- ^ thread-block-wide reduction using the specified operator (lane 0 only) -reduceBlockSMem dev tp combine size = warpReduce >=> warpAggregate - where - int32 :: Integral a => a -> Operands Int32 - int32 = liftInt32 . P.fromIntegral - - -- Temporary storage required for each warp - bytes = bytesElt tp - warp_smem_elems = CUDA.warpSize dev + (CUDA.warpSize dev `P.quot` 2) - - -- Step 1: Reduction in every warp - -- - warpReduce :: Operands e -> CodeGen PTX (Operands e) - warpReduce input = do - -- Allocate (1.5 * warpSize) elements of shared memory for each warp - wid <- warpId - skip <- A.mul numType wid (int32 (warp_smem_elems * bytes)) -- This does not take alignment into account - smem <- dynamicSharedMem tp TypeInt32 (int32 warp_smem_elems) skip - - -- Are we doing bounds checking for this warp? - -- - case size of - -- The entire thread block is valid, so skip bounds checks. 
- Nothing -> - reduceWarpSMem dev tp combine smem Nothing input - - -- Otherwise check how many elements are valid for this warp. If it is - -- full then we can still skip bounds checks for it. - Just n -> do - offset <- A.mul numType wid (int32 (CUDA.warpSize dev)) - valid <- A.sub numType n offset - if (tp, A.gte singleType valid (int32 (CUDA.warpSize dev))) - then reduceWarpSMem dev tp combine smem Nothing input - else reduceWarpSMem dev tp combine smem (Just valid) input - - -- Step 2: Aggregate per-warp reductions - -- - warpAggregate :: Operands e -> CodeGen PTX (Operands e) - warpAggregate input = do - -- Allocate #warps elements of shared memory - bd <- blockDim - warps <- A.quot integralType bd (int32 (CUDA.warpSize dev)) - skip <- A.mul numType warps (int32 (warp_smem_elems * bytes)) -- This does not take alignment into account - smem <- dynamicSharedMem tp TypeInt32 warps skip - - -- Share the per-lane aggregates - wid <- warpId - lane <- laneId - when (A.eq singleType lane (liftInt32 0)) $ do - writeArray TypeInt32 smem wid input - - -- Wait for each warp to finish its local reduction - __syncthreads - - -- Update the total aggregate. Thread 0 just does this sequentially (as is - -- done in CUB), but we could also do this cooperatively (better for - -- larger thread blocks?) - tid <- threadIdx - if (tp, A.eq singleType tid (liftInt32 0)) - then do - steps <- case size of - Nothing -> return warps - Just n -> do - a <- A.add numType n (int32 (CUDA.warpSize dev - 1)) - b <- A.quot integralType a (int32 (CUDA.warpSize dev)) - return b - iterFromStepTo tp (liftInt32 1) (liftInt32 1) steps input $ \step x -> - app2 combine x =<< readArray TypeInt32 smem step - else - return input - - --- Efficient warp-wide reduction using shared memory. The aggregate reduction --- value for the warp is stored in thread lane zero. --- --- Each warp requires 48 (1.5 x warp size) elements of shared memory. The --- routine assumes that is is allocated individually per-warp (i.e. can be --- indexed in the range [0,warp size)). --- --- Example: https://github.com/NVlabs/cub/blob/1.5.2/cub/warp/specializations/warp_reduce_smem.cuh#L128 --- -reduceWarpSMem - :: forall aenv e. - DeviceProperties -- ^ properties of the target device - -> TypeR e - -> IRFun2 PTX aenv (e -> e -> e) -- ^ combination function - -> IRArray (Vector e) -- ^ temporary storage array in shared memory (1.5 warp size elements) - -> Maybe (Operands Int32) -- ^ number of items that will be reduced by this warp, otherwise all lanes are valid - -> Operands e -- ^ calling thread's input element - -> CodeGen PTX (Operands e) -- ^ warp-wide reduction using the specified operator (lane 0 only) -reduceWarpSMem dev tp combine smem size = reduce 0 - where - log2 :: Double -> Double - log2 = P.logBase 2 - - -- Number steps required to reduce warp - steps = P.floor . log2 . P.fromIntegral . CUDA.warpSize $ dev - - -- Return whether the index is valid. Assume that constant branches are - -- optimised away. - valid i = - case size of - Nothing -> return (liftBool True) - Just n -> A.lt singleType i n - - -- Unfold the reduction as a recursive code generation function. 
-    reduce :: Int -> Operands e -> CodeGen PTX (Operands e)
-    reduce step x
-      | step >= steps = return x
-      | otherwise     = do
-          let offset = liftInt32 (1 `P.shiftL` step)
-
-          -- share input through buffer
-          lane <- laneId
-          writeArray TypeInt32 smem lane x
-
-          __syncwarp
-
-          -- update input if in range
-          i  <- A.add numType lane offset
-          x' <- if (tp, valid i)
-                  then app2 combine x =<< readArray TypeInt32 smem i
-                  else return x
-
-          __syncwarp
-
-          reduce (step+1) x'
-
-
--- Equivalent to `reduceBlockSMem` but using warp shuffle instructions
---
-reduceBlockShfl
-    :: forall aenv e.
-       DeviceProperties                -- ^ properties of the target device
-    -> TypeR e
-    -> IRFun2 PTX aenv (e -> e -> e)   -- ^ combination function
-    -> Maybe (Operands Int32)          -- ^ number of valid elements (may be less than block size)
-    -> Operands e                      -- ^ calling thread's input element
-    -> CodeGen PTX (Operands e)        -- ^ thread-block-wide reduction using the specified operator (lane 0 only)
-reduceBlockShfl dev tp combine size = warpReduce >=> warpAggregate
+reduceBlock dev tp combine size = warpReduce >=> warpAggregate
   where
     int32 :: Integral a => a -> Operands Int32
     int32 = liftInt32 . P.fromIntegral
@@ -616,7 +447,7 @@ reduceBlockShfl dev tp combine size = warpReduce >=> warpAggregate
       -- Are we doing bounds checking for this warp?
       case size of
        -- The entire thread block is valid, so skip bounds checks.
-        Nothing -> reduceWarpShfl dev tp combine Nothing input
+        Nothing -> reduceWarp dev tp combine Nothing input

        -- Otherwise check how many elements are valid for this warp. If it is
        -- full then we can still skip bounds checks for it.
        Just n -> do
@@ -624,8 +455,8 @@ reduceBlockShfl dev tp combine size = warpReduce >=> warpAggregate
          offset <- A.mul numType wid (int32 (CUDA.warpSize dev))
          valid  <- A.sub numType n offset
          if (tp, A.gte singleType valid (int32 (CUDA.warpSize dev)))
-            then reduceWarpShfl dev tp combine Nothing input
-            else reduceWarpShfl dev tp combine (Just valid) input
+            then reduceWarp dev tp combine Nothing input
+            else reduceWarp dev tp combine (Just valid) input

     -- Step 2: Aggregate per-warp reductions
     --
@@ -672,16 +503,23 @@ reduceBlockShfl dev tp combine size = warpReduce >=> warpAggregate
         return input


--- Equivalent to 'reduceWarpSmem' but using warp shuffle instructions
+-- Efficient warp-wide reduction using warp shuffle instructions. The
+-- aggregate reduction value for the warp is stored in thread lane zero.
+--
+-- Partial results are exchanged directly between registers, so unlike the
+-- former shared-memory variant this routine needs no per-warp temporary
+-- storage in shared memory.
+--
+-- Example: https://github.com/NVlabs/cub/blob/1.5.2/cub/warp/specializations/warp_reduce_shfl.cuh
 --
-reduceWarpShfl
+reduceWarp
    :: forall e aenv.
DeviceProperties -> TypeR e -> IRFun2 PTX aenv (e -> e -> e) -- ^ combination function -> Maybe (Operands Int32) -- ^ number of items that will be reduced by this warp, otherwise all lanes are valid -> Operands e -- ^ this thread's input value -> CodeGen PTX (Operands e) -- ^ final result -reduceWarpShfl dev typer combine size = reduce 0 +reduceWarp dev typer combine size = reduce 0 where log2 :: Double -> Double log2 = P.logBase 2 @@ -738,7 +576,7 @@ reduceFromTo dev tp from to combine get set = do -- All threads in the block will participate in the reduction, so -- we can avoid bounds checks x <- get i - r <- reduceBlockShfl dev tp combine Nothing x + r <- reduceBlock dev tp combine Nothing x set r return (lift TupRunit ()) @@ -748,7 +586,7 @@ reduceFromTo dev tp from to combine get set = do when (A.lt singleType i to) $ do x <- get i v <- i32 valid - r <- reduceBlockShfl dev tp combine (Just v) x + r <- reduceBlock dev tp combine (Just v) x set r return (lift TupRunit ()) diff --git a/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/FoldSeg.hs b/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/FoldSeg.hs index 9dfc6e11f..259a49cd3 100644 --- a/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/FoldSeg.hs +++ b/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/FoldSeg.hs @@ -20,7 +20,6 @@ module Data.Array.Accelerate.LLVM.PTX.CodeGen.FoldSeg where import Data.Array.Accelerate.Representation.Array -import Data.Array.Accelerate.Representation.Elt import Data.Array.Accelerate.Representation.Shape import Data.Array.Accelerate.Representation.Type @@ -38,7 +37,7 @@ import Data.Array.Accelerate.LLVM.Compile.Cache import Data.Array.Accelerate.LLVM.PTX.Analysis.Launch import Data.Array.Accelerate.LLVM.PTX.CodeGen.Base -import Data.Array.Accelerate.LLVM.PTX.CodeGen.Fold ( reduceBlock, reduceWarpShfl, reduceWarpSMem, imapFromTo ) +import qualified Data.Array.Accelerate.LLVM.PTX.CodeGen.Fold as Fold ( reduceBlock, reduceWarp, imapFromTo ) import Data.Array.Accelerate.LLVM.PTX.Target import LLVM.AST.Type.Representation @@ -48,7 +47,6 @@ import qualified Foreign.CUDA.Analysis as CUDA import Control.Monad ( void ) import Control.Monad.State ( gets ) import Data.String ( fromString ) -import Foreign.Storable ( sizeOf ) import Prelude as P @@ -99,14 +97,10 @@ mkFoldSegP_block uid aenv repr@(ArrayR shr tp) intTp combine mseed marr mseg = d paramEnv = envParam aenv -- config = launchConfig dev (CUDA.decWarp dev) dsmem const [|| const ||] - dsmem n - | canShfl dev = sharedMemorySizeAdd tp warps 0 - | otherwise = warps * (1 + per_warp) * bytes -- This does not take alignment into account + dsmem n = sharedMemorySizeAdd tp warps 0 where ws = CUDA.warpSize dev warps = n `P.quot` ws - per_warp = ws + ws `P.quot` 2 - bytes = bytesElt tp -- makeOpenAccWith config uid "foldSeg_block" (paramOut ++ paramIn ++ paramSeg ++ paramEnv) $ do @@ -143,7 +137,7 @@ mkFoldSegP_block uid aenv repr@(ArrayR shr tp) intTp combine mseed marr mseg = d start <- return (liftInt 0) end <- shapeSize shr (irArrayShape arrOut) - imapFromTo start end $ \s -> do + Fold.imapFromTo start end $ \s -> do -- The first two threads of the block determine the indices of the -- segments array that we will reduce between and distribute those values @@ -211,8 +205,8 @@ mkFoldSegP_block uid aenv repr@(ArrayR shr tp) intTp combine mseed marr mseg = d v0 <- A.sub numType sup inf v0' <- i32 v0 r0 <- if (tp, A.gte singleType v0 bd) - then reduceBlock dev tp combine Nothing x0 - else reduceBlock dev tp 
combine (Just v0') x0 + then Fold.reduceBlock dev tp combine Nothing x0 + else Fold.reduceBlock dev tp combine (Just v0') x0 -- Step 2: keep walking over the input nxt <- A.add numType inf bd @@ -228,7 +222,7 @@ mkFoldSegP_block uid aenv repr@(ArrayR shr tp) intTp combine mseed marr mseg = d -- can avoid bounds checks. then do x <- app1 (delayedLinearIndex arrIn) i' - y <- reduceBlock dev tp combine Nothing x + y <- Fold.reduceBlock dev tp combine Nothing x return y -- Not all threads are valid. Note that we still @@ -247,7 +241,7 @@ mkFoldSegP_block uid aenv repr@(ArrayR shr tp) intTp combine mseed marr mseg = d return $ go tp z <- i32 v' - y <- reduceBlock dev tp combine (Just z) x + y <- Fold.reduceBlock dev tp combine (Just z) x return y -- first thread incorporates the result from the previous @@ -298,34 +292,15 @@ mkFoldSegP_warp uid aenv repr@(ArrayR shr tp) intTp combine mseed marr mseg = do paramEnv = envParam aenv -- config = launchConfig dev (CUDA.decWarp dev) dsmem grid gridQ - dsmem n = warps * per_warp_bytes - where - warps = (n + ws - 1) `P.quot` ws + where dsmem _n = 0 -- grid n m = multipleOf n (m `P.quot` ws) gridQ = [|| \n m -> $$multipleOfQ n (m `P.quot` ws) ||] -- - per_warp_bytes - | canShfl dev = 0 - | otherwise = (2 * sizeOf (undefined::Int)) `P.max` (bytesElt tp * per_warp_elems) -- This does not take alignment into account - per_warp_elems = ws + (ws `P.quot` 2) ws = CUDA.warpSize dev int32 :: Integral a => a -> Operands Int32 int32 = liftInt32 . P.fromIntegral - - unsmem :: TypeR a -> IRArray (Vector a) - unsmem aR = - let go :: TypeR s -> Operands s - go TupRunit = OP_Unit - go (TupRpair t1 t2) = OP_Pair (go t1) (go t2) - go (TupRsingle t) = ir t (undef t) - in - IRArray (ArrayR dim1 aR) - (OP_Pair OP_Unit (ir integralType (integral integralType 0))) - (go aR) - sharedMemAddrSpace - sharedMemVolatility -- makeOpenAccWith config uid "foldSeg_warp" (paramOut ++ paramIn ++ paramSeg ++ paramEnv) $ do @@ -344,37 +319,6 @@ mkFoldSegP_warp uid aenv repr@(ArrayR shr tp) intTp combine mseed marr mseg = do b <- A.add numType wid a return b - -- All threads in the warp need to know what the start and end indices of - -- this segment are in order to participate in the reduction. We use - -- variables in __shared__ memory to communicate these values between - -- threads. Furthermore, by using a 2-element array, we can have the first - -- two threads of the warp read the start and end indices as a single - -- coalesced read, as these elements will be adjacent in the segment-offset - -- array. - -- - -- Note that this is aliased with the memory used to communicate reduction - -- values within the warp. - -- - lim <- case canShfl dev of - True -> return (unsmem (TupRsingle scalarTypeInt)) - False -> do - a <- A.mul numType wid (int32 per_warp_bytes) -- This does not take alignment into account - b <- dynamicSharedMem (TupRsingle scalarTypeInt) TypeInt32 (liftInt32 0) a - return b - - -- Allocate (1.5 * warpSize) elements of shared memory for each warp to - -- communicate reduction values in the SMem kernel. - -- - -- Note that this is aliased with the memory used to communicate the start - -- and end indices of this segment. - -- - smem <- case canShfl dev of - True -> return (unsmem tp) - False -> do - a <- A.mul numType wid (int32 per_warp_bytes) -- This does not take alignment into account - b <- dynamicSharedMem tp TypeInt32 (int32 per_warp_elems) a - return b - -- Compute the number of segments and size of the innermost dimension. 
These -- are required if we are reducing a rank-2 or higher array, to properly -- compute the start and end indices of the portion of the array this warp @@ -408,9 +352,6 @@ mkFoldSegP_warp uid aenv repr@(ArrayR shr tp) intTp combine mseed marr mseg = do b <- A.add numType a =<< int lane c <- app1 (delayedLinearIndex arrSeg) b d <- A.fromIntegral intTp numType c - case canShfl dev of - True -> return () - False -> writeArray TypeInt32 lim lane d return d else return (ir integralType (undef scalarType)) @@ -420,13 +361,9 @@ mkFoldSegP_warp uid aenv repr@(ArrayR shr tp) intTp combine mseed marr mseg = do -- Determine the index range of the input array we will reduce over. -- Necessary for multidimensional segmented reduction. (inf,sup) <- do - u <- case canShfl dev of - True -> __shfl_idx (TupRsingle scalarTypeInt) idx (liftWord32 0) - False -> readArray TypeInt32 lim (liftInt32 0) + u <- __shfl_idx (TupRsingle scalarTypeInt) idx (liftWord32 0) - v <- case canShfl dev of - True -> __shfl_idx (TupRsingle scalarTypeInt) idx (liftWord32 1) - False -> readArray TypeInt32 lim (liftInt32 1) + v <- __shfl_idx (TupRsingle scalarTypeInt) idx (liftWord32 1) A.unpair <$> case shr of ShapeRsnoc ShapeRz -> return (A.pair u v) _ -> do q <- A.quot integralType s ss @@ -467,8 +404,8 @@ mkFoldSegP_warp uid aenv repr@(ArrayR shr tp) intTp combine mseed marr mseg = do v0 <- A.sub numType sup inf v0' <- i32 v0 r0 <- if (tp, A.gte singleType v0 (liftInt ws)) - then reduceWarp dev tp combine smem Nothing x0 - else reduceWarp dev tp combine smem (Just v0') x0 + then reduceWarp dev tp combine Nothing x0 + else reduceWarp dev tp combine (Just v0') x0 -- Step 2: Keep walking over the rest of the segment nx <- A.add numType inf (liftInt ws) @@ -483,7 +420,7 @@ mkFoldSegP_warp uid aenv repr@(ArrayR shr tp) intTp combine mseed marr mseg = do then do -- All lanes are in bounds, so avoid bounds checks x <- app1 (delayedLinearIndex arrIn) i' - y <- reduceWarp dev tp combine smem Nothing x + y <- reduceWarp dev tp combine Nothing x return y else do @@ -498,7 +435,7 @@ mkFoldSegP_warp uid aenv repr@(ArrayR shr tp) intTp combine mseed marr mseg = do return $ go tp z <- i32 v' - y <- reduceWarp dev tp combine smem (Just z) x + y <- reduceWarp dev tp combine (Just z) x return y -- The first lane incorporates the result from the previous @@ -532,11 +469,8 @@ reduceWarp DeviceProperties -- ^ properties of the target device -> TypeR e -> IRFun2 PTX aenv (e -> e -> e) -- ^ combination function - -> IRArray (Vector e) -- ^ temporary storage array in shared memory (1.5 warp size elements) -> Maybe (Operands Int32) -- ^ number of items that will be reduced by this warp, otherwise all lanes are valid -> Operands e -- ^ calling thread's input element -> CodeGen PTX (Operands e) -- ^ warp-wide reduction using the specified operator (lane 0 only) -reduceWarp dev t c smem - | canShfl dev = reduceWarpShfl dev t c - | otherwise = reduceWarpSMem dev t c smem +reduceWarp dev t c = Fold.reduceWarp dev t c diff --git a/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/Scan.hs b/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/Scan.hs index 618257677..1e4c3bdc3 100644 --- a/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/Scan.hs +++ b/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/CodeGen/Scan.hs @@ -26,7 +26,6 @@ module Data.Array.Accelerate.LLVM.PTX.CodeGen.Scan ( import Data.Array.Accelerate.AST ( Direction(..) 
) import Data.Array.Accelerate.Representation.Array -import Data.Array.Accelerate.Representation.Elt import Data.Array.Accelerate.Representation.Shape import Data.Array.Accelerate.Representation.Type @@ -1134,32 +1133,14 @@ mkScan'Fill uid aenv repr seed = Safe.coerce <$> mkGenerate uid aenv (reduceRank repr) (IRFun1 (const seed)) scanSMemSize :: DeviceProperties -> TypeR e -> Int -> Int -scanSMemSize dev tp n - | canShfl dev = sharedMemorySizeAdd tp warps 0 - | otherwise = warps * (1 + per_warp) * bytes -- This does not take alignment into account +scanSMemSize dev tp n = sharedMemorySizeAdd tp warps 0 where ws = CUDA.warpSize dev warps = n `P.quot` ws - per_warp = ws + ws `P.quot` 2 - bytes = bytesElt tp -- Block wide scan -- --------------- -scanBlock - :: forall aenv e. - Direction - -> DeviceProperties -- ^ properties of the target device - -> TypeR e - -> IRFun2 PTX aenv (e -> e -> e) -- ^ combination function - -> Maybe (Operands Int32) -- ^ number of valid elements (may be less than block size) - -> Operands e -- ^ calling thread's input element - -> CodeGen PTX (Operands e) -scanBlock dir dev - | canShfl dev = scanBlockShfl dir dev -- shfl instruction available in compute >= 3.0 - | otherwise = scanBlockSMem dir dev -- equivalent, slightly slower version - - -- Efficient block-wide (inclusive) scan using the specified operator. -- -- Each block requires (#warps * (1 + 1.5*warp size)) elements of dynamically @@ -1167,17 +1148,7 @@ scanBlock dir dev -- -- Example: https://github.com/NVlabs/cub/blob/1.5.4/cub/block/specializations/block_scan_warp_scans.cuh -- --- NOTE: [Synchronisation problems with SM_70 and greater] --- --- This operation uses thread synchronisation. When calling this operation, it --- is important that all active (that is, non-exited) threads of the thread --- block participate. It seems that sm_70+ (devices with independent thread --- scheduling) are stricter about the requirement that all non-existed threads --- participate in every barrier. --- --- See: https://github.com/AccelerateHS/accelerate/issues/436 --- -scanBlockSMem +scanBlock :: forall aenv e. Direction -> DeviceProperties -- ^ properties of the target device @@ -1186,24 +1157,18 @@ scanBlockSMem -> Maybe (Operands Int32) -- ^ number of valid elements (may be less than block size) -> Operands e -- ^ calling thread's input element -> CodeGen PTX (Operands e) -scanBlockSMem dir dev tp combine nelem = warpScan >=> warpPrefix +scanBlock dir dev tp combine nelem = warpScan >=> warpPrefix where int32 :: Integral a => a -> Operands Int32 int32 = liftInt32 . 
P.fromIntegral -- Temporary storage required for each warp - warp_smem_elems = CUDA.warpSize dev + (CUDA.warpSize dev `P.quot` 2) - warp_smem_bytes = warp_smem_elems * bytesElt tp + -- warp_smem_elems = CUDA.warpSize dev + (CUDA.warpSize dev `P.quot` 2) + -- warp_smem_bytes = warp_smem_elems * bytesElt tp -- Step 1: Scan in every warp warpScan :: Operands e -> CodeGen PTX (Operands e) - warpScan input = do - -- Allocate (1.5 * warpSize) elements of shared memory for each warp - -- (individually addressable by each warp) - wid <- warpId - skip <- A.mul numType wid (int32 warp_smem_bytes) -- This does not take alignment into account - smem <- dynamicSharedMem tp TypeInt32 (int32 warp_smem_elems) skip - scanWarpSMem dir dev tp combine smem input + warpScan = scanWarp dir dev tp combine -- Step 2: Collect the aggregate results of each warp to compute the prefix -- values for each warp and combine with the partial result to compute each @@ -1213,8 +1178,7 @@ scanBlockSMem dir dev tp combine nelem = warpScan >=> warpPrefix -- Allocate #warps elements of shared memory bd <- blockDim warps <- A.quot integralType bd (int32 (CUDA.warpSize dev)) - skip <- A.mul numType warps (int32 warp_smem_bytes) -- This does not take alignment into account - smem <- dynamicSharedMem tp TypeInt32 warps skip + smem <- dynamicSharedMem tp TypeInt32 warps (liftInt32 0) -- Share warp aggregates wid <- warpId @@ -1234,7 +1198,7 @@ scanBlockSMem dir dev tp combine nelem = warpScan >=> warpPrefix -- their prefix value. We do this sequentially, but could also have -- warp 0 do it cooperatively if we limit thread block sizes to -- (warp size ^ 2). - steps <- case nelem of + steps <- case nelem of Nothing -> return wid Just n -> A.min singleType wid =<< A.quot integralType n (int32 (CUDA.warpSize dev)) @@ -1261,121 +1225,7 @@ scanBlockSMem dir dev tp combine nelem = warpScan >=> warpPrefix -- -- Example: https://github.com/NVlabs/cub/blob/1.5.4/cub/warp/specializations/warp_scan_smem.cuh -- -scanWarpSMem - :: forall aenv e. - Direction - -> DeviceProperties -- ^ properties of the target device - -> TypeR e - -> IRFun2 PTX aenv (e -> e -> e) -- ^ combination function - -> IRArray (Vector e) -- ^ temporary storage array in shared memory (1.5 x warp size elements) - -> Operands e -- ^ calling thread's input element - -> CodeGen PTX (Operands e) -scanWarpSMem dir dev tp combine smem = scan 0 - where - log2 :: Double -> Double - log2 = P.logBase 2 - - -- Number of steps required to scan warp - steps = P.floor (log2 (P.fromIntegral (CUDA.warpSize dev))) - halfWarp = P.fromIntegral (CUDA.warpSize dev `P.quot` 2) - - -- Unfold the scan as a recursive code generation function - scan :: Int -> Operands e -> CodeGen PTX (Operands e) - scan step x - | step >= steps = return x - | otherwise = do - let offset = liftInt32 (1 `P.shiftL` step) - - -- share partial result through shared memory buffer - lane <- laneId - i <- A.add numType lane (liftInt32 halfWarp) - writeArray TypeInt32 smem i x - - __syncwarp - - -- update partial result if in range - x' <- if (tp, A.gte singleType lane offset) - then do - i' <- A.sub numType i offset -- lane + HALF_WARP - offset - x' <- readArray TypeInt32 smem i' - case dir of - LeftToRight -> app2 combine x' x - RightToLeft -> app2 combine x x' - - else - return x - - __syncwarp - - scan (step+1) x' - - -scanBlockShfl - :: forall aenv e. 
- Direction - -> DeviceProperties -- ^ properties of the target device - -> TypeR e - -> IRFun2 PTX aenv (e -> e -> e) -- ^ combination function - -> Maybe (Operands Int32) -- ^ number of valid elements (may be less than block size) - -> Operands e -- ^ calling thread's input element - -> CodeGen PTX (Operands e) -scanBlockShfl dir dev tp combine nelem = warpScan >=> warpPrefix - where - int32 :: Integral a => a -> Operands Int32 - int32 = liftInt32 . P.fromIntegral - - -- Temporary storage required for each warp - -- warp_smem_elems = CUDA.warpSize dev + (CUDA.warpSize dev `P.quot` 2) - -- warp_smem_bytes = warp_smem_elems * bytesElt tp - - -- Step 1: Scan in every warp - warpScan :: Operands e -> CodeGen PTX (Operands e) - warpScan = scanWarpShfl dir dev tp combine - - -- Step 2: Collect the aggregate results of each warp to compute the prefix - -- values for each warp and combine with the partial result to compute each - -- thread's final value. - warpPrefix :: Operands e -> CodeGen PTX (Operands e) - warpPrefix input = do - -- Allocate #warps elements of shared memory - bd <- blockDim - warps <- A.quot integralType bd (int32 (CUDA.warpSize dev)) - smem <- dynamicSharedMem tp TypeInt32 warps (liftInt32 0) - - -- Share warp aggregates - wid <- warpId - lane <- laneId - when (A.eq singleType lane (int32 (CUDA.warpSize dev - 1))) $ do - writeArray TypeInt32 smem wid input - - -- Wait for each warp to finish its local scan and share the aggregate - __syncthreads - - -- Compute the prefix value for this warp and add to the partial result. - -- This step is not required for the first warp, which has no carry-in. - if (tp, A.eq singleType wid (liftInt32 0)) - then return input - else do - -- Every thread sequentially scans the warp aggregates to compute - -- their prefix value. We do this sequentially, but could also have - -- warp 0 do it cooperatively if we limit thread block sizes to - -- (warp size ^ 2). - steps <- case nelem of - Nothing -> return wid - Just n -> A.min singleType wid =<< A.quot integralType n (int32 (CUDA.warpSize dev)) - - p0 <- readArray TypeInt32 smem (liftInt32 0) - prefix <- iterFromStepTo tp (liftInt32 1) (liftInt32 1) steps p0 $ \step x -> do - y <- readArray TypeInt32 smem step - case dir of - LeftToRight -> app2 combine x y - RightToLeft -> app2 combine y x - - case dir of - LeftToRight -> app2 combine prefix input - RightToLeft -> app2 combine input prefix - -scanWarpShfl +scanWarp :: forall aenv e. Direction -> DeviceProperties -- ^ properties of the target device @@ -1383,7 +1233,7 @@ scanWarpShfl -> IRFun2 PTX aenv (e -> e -> e) -- ^ combination function -> Operands e -- ^ calling thread's input element -> CodeGen PTX (Operands e) -scanWarpShfl dir dev tp combine = scan 0 +scanWarp dir dev tp combine = scan 0 where log2 :: Double -> Double log2 = P.logBase 2 diff --git a/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/Compile.hs b/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/Compile.hs index 534c0a246..81ec1acd3 100644 --- a/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/Compile.hs +++ b/accelerate-llvm-ptx/src/Data/Array/Accelerate/LLVM/PTX/Compile.hs @@ -108,6 +108,10 @@ compile pacc aenv = do Module ast md <- llvmOfPreOpenAcc uid pacc aenv let config = [ (f,x) | (LLVM.Name f, KM_PTX x) <- HashMap.toList md ] + case isDeviceSupported (CUDA.computeCapability dev) of + Nothing -> return () -- all fine + Just err -> internalError string err + -- Lower the generated LLVM into a CUBIN object code. 
   --
   -- The 'objData' field is lazily evaluated since the object code might have
@@ -295,3 +299,11 @@ moduleTargetAssembly tm m = unsafe0 =<< LLVM.Internal.emitToByteString LLVM.Inte
         _ | B.isSpaceWord8 x -> poke p' 0 >> return bs
         _                    -> return (B.snoc bs 0)

+-- | Returns a human-readable error message if the device is unsupported, and
+-- 'Nothing' if the device is supported.
+isDeviceSupported :: CUDA.Compute -> Maybe String
+isDeviceSupported cc@(CUDA.Compute m _)
+  -- We require shfl instructions, which are only available from CC 3.0.
+  | m >= 3    = Nothing
+  | otherwise = Just $
+      "Your GPU has compute capability " ++ show cc ++ ", but only >= 3.0 is supported."
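
For reference, the alignment arithmetic that patch 1 introduces can be
exercised in isolation. Below is a minimal, self-contained sketch, not part
of the patches themselves: it mirrors 'alignTo' and specialises
'sharedMemorySizeAdd' to a plain list of per-field byte sizes instead of
Accelerate's 'TypeR' tree. It assumes each field's alignment equals its byte
size, as 'scalarAlignment' arranges for single scalar types; the names
'sharedMemorySize' and 'fields' are illustrative only.

import Data.Bits (complement, (.&.))

-- Round 'ptr' up to the next multiple of 'align'. Assumes 'align' is a
-- power of 2, exactly as in the patched Base.hs.
alignTo :: Int -> Int -> Int
alignTo align ptr = (ptr + align - 1) .&. complement (align - 1)

-- Bytes of shared memory needed for 'n' elements of a product type whose
-- fields have the given byte sizes. Each field's region starts at an
-- offset rounded up to that field's assumed alignment.
sharedMemorySize :: [Int] -> Int -> Int
sharedMemorySize fields n = foldl step 0 fields
  where
    step offset bytes = alignTo bytes offset + bytes * n

-- With 33 elements of (Bool, Int), taken as a 1-byte and an 8-byte field
-- on a 64-bit target, the Int region starts at byte 40 (33 rounded up to
-- a multiple of 8) rather than byte 33, which is the misalignment the
-- commit message describes:
--
--   sharedMemorySize [1, 8] 33  ==  40 + 8 * 33  ==  304
main :: IO ()
main = print (sharedMemorySize [1, 8] 33)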