diff --git a/lib/dynamic-character/src/DirectOptimization/Pairwise.hs b/lib/dynamic-character/src/DirectOptimization/Pairwise.hs index c4c956d02..16dc5cdde 100644 --- a/lib/dynamic-character/src/DirectOptimization/Pairwise.hs +++ b/lib/dynamic-character/src/DirectOptimization/Pairwise.hs @@ -1,32 +1,34 @@ ----------------------------------------------------------------------------- --- | --- Module : DirectOptimization.Pairwise --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- --- Pairwise direct optimization alignment functions using a variety of techniques. --- + ----------------------------------------------------------------------------- -module DirectOptimization.Pairwise - ( +{- | +Module : DirectOptimization.Pairwise +Copyright : (c) 2015-2021 Ward Wheeler +License : BSD-style + +Maintainer : wheeler@amnh.org +Stability : provisional +Portability : portable + +Pairwise direct optimization alignment functions using a variety of techniques. +-} +module DirectOptimization.Pairwise ( -- * Slim characters - SlimDynamicCharacter - , SlimState - , slimPairwiseDO + SlimDynamicCharacter, + SlimState, + slimPairwiseDO, + -- * Wide characters - , WideDynamicCharacter - , WideState - , widePairwiseDO + WideDynamicCharacter, + WideState, + widePairwiseDO, + -- * Huge characters - , HugeDynamicCharacter - , HugeState - , hugePairwiseDO - ) where + HugeDynamicCharacter, + HugeState, + hugePairwiseDO, +) where import DirectOptimization.Pairwise.Huge import DirectOptimization.Pairwise.Slim diff --git a/lib/dynamic-character/src/DirectOptimization/Pairwise/Direction.hs b/lib/dynamic-character/src/DirectOptimization/Pairwise/Direction.hs index 573d59478..7530ac40f 100644 --- a/lib/dynamic-character/src/DirectOptimization/Pairwise/Direction.hs +++ b/lib/dynamic-character/src/DirectOptimization/Pairwise/Direction.hs @@ -1,61 +1,63 @@ ----------------------------------------------------------------------------- --- | --- Module : DirectOptimization.Pairwise.Direction --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- --- Direct optimization pairwise alignment using the Needleman-Wunsch algorithm. --- These functions will allocate an M * N matrix. --- ----------------------------------------------------------------------------- - -{-# LANGUAGE DerivingStrategies #-} -{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE DerivingStrategies #-} +{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE MultiParamTypeClasses #-} -{-# LANGUAGE Strict #-} -{-# LANGUAGE TypeFamilies #-} -{-# LANGUAGE UnboxedTuples #-} - -module DirectOptimization.Pairwise.Direction - ( Direction(..) - -- * Querying - , minimumCostDirection - -- * Rendering - , boldDirection - ) where - -import Data.Int -import qualified Data.Vector.Generic as GV -import qualified Data.Vector.Generic.Mutable as MGV -import qualified Data.Vector.Primitive as PV -import qualified Data.Vector.Unboxed as UV -import Data.Word - - --- | --- Which direction to align the character at a given matrix point. --- --- It should be noted that the ordering of the three arrow types are important, --- as it guarantees that the derived 'Ord' instance will have the following --- property: --- --- DiagArrow < LeftArrow < UpArrow --- --- This means: --- --- - DiagArrow has highest precedence when one or more costs are equal --- --- - LeftArrow has second highest precedence when one or more costs are equal --- --- - UpArrow has lowest precedence when one or more costs are equal --- --- Using this 'Ord' instance, we can resolve ambiguous transformations in a --- deterministic way. Without loss of generality in determining the ordering, --- we choose the same biasing as the C code called from the FFI for consistency. +{-# LANGUAGE Strict #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE UnboxedTuples #-} + +{- | +Module : DirectOptimization.Pairwise.Direction +Copyright : (c) 2015-2021 Ward Wheeler +License : BSD-style + +Maintainer : wheeler@amnh.org +Stability : provisional +Portability : portable + +Direct optimization pairwise alignment using the Needleman-Wunsch algorithm. +These functions will allocate an M * N matrix. +-} +module DirectOptimization.Pairwise.Direction ( + Direction (..), + + -- * Querying + minimumCostDirection, + + -- * Rendering + boldDirection, +) where + +import Data.Int +import Data.Vector.Generic qualified as GV +import Data.Vector.Generic.Mutable qualified as MGV +import Data.Vector.Primitive qualified as PV +import Data.Vector.Unboxed qualified as UV +import Data.Word + + +{- | +Which direction to align the character at a given matrix point. + +It should be noted that the ordering of the three arrow types are important, +as it guarantees that the derived 'Ord' instance will have the following +property: + +DiagArrow < LeftArrow < UpArrow + +This means: + + - DiagArrow has highest precedence when one or more costs are equal + + - LeftArrow has second highest precedence when one or more costs are equal + + - UpArrow has lowest precedence when one or more costs are equal + +Using this 'Ord' instance, we can resolve ambiguous transformations in a +deterministic way. Without loss of generality in determining the ordering, +we choose the same biasing as the C code called from the FFI for consistency. +-} data Direction = DiagArrow | LeftArrow | UpArrow deriving stock (Eq, Ord) @@ -63,132 +65,148 @@ data Direction = DiagArrow | LeftArrow | UpArrow newtype instance UV.MVector s Direction = MV_Direction (PV.MVector s Word8) -newtype instance UV.Vector Direction = V_Direction (PV.Vector Word8) +newtype instance UV.Vector Direction = V_Direction (PV.Vector Word8) instance UV.Unbox Direction instance MGV.MVector UV.MVector Direction where - {-# INLINE basicLength #-} basicLength (MV_Direction v) = MGV.basicLength v + {-# INLINE basicUnsafeSlice #-} basicUnsafeSlice i n (MV_Direction v) = MV_Direction $ MGV.basicUnsafeSlice i n v + {-# INLINE basicOverlaps #-} basicOverlaps (MV_Direction v1) (MV_Direction v2) = MGV.basicOverlaps v1 v2 + {-# INLINE basicUnsafeNew #-} basicUnsafeNew n = MV_Direction <$> MGV.basicUnsafeNew n + {-# INLINE basicInitialize #-} basicInitialize (MV_Direction v) = MGV.basicInitialize v + {-# INLINE basicUnsafeReplicate #-} basicUnsafeReplicate n x = MV_Direction <$> MGV.basicUnsafeReplicate n (fromDirection x) + {-# INLINE basicUnsafeRead #-} basicUnsafeRead (MV_Direction v) i = toDirection <$> MGV.basicUnsafeRead v i + {-# INLINE basicUnsafeWrite #-} basicUnsafeWrite (MV_Direction v) i x = MGV.basicUnsafeWrite v i (fromDirection x) + {-# INLINE basicClear #-} basicClear (MV_Direction v) = MGV.basicClear v + {-# INLINE basicSet #-} basicSet (MV_Direction v) x = MGV.basicSet v (fromDirection x) + {-# INLINE basicUnsafeCopy #-} basicUnsafeCopy (MV_Direction v1) (MV_Direction v2) = MGV.basicUnsafeCopy v1 v2 + basicUnsafeMove (MV_Direction v1) (MV_Direction v2) = MGV.basicUnsafeMove v1 v2 + {-# INLINE basicUnsafeGrow #-} basicUnsafeGrow (MV_Direction v) n = MV_Direction <$> MGV.basicUnsafeGrow v n instance GV.Vector UV.Vector Direction where - {-# INLINE basicUnsafeFreeze #-} basicUnsafeFreeze (MV_Direction v) = V_Direction <$> GV.basicUnsafeFreeze v + {-# INLINE basicUnsafeThaw #-} basicUnsafeThaw (V_Direction v) = MV_Direction <$> GV.basicUnsafeThaw v + {-# INLINE basicLength #-} basicLength (V_Direction v) = GV.basicLength v + {-# INLINE basicUnsafeSlice #-} basicUnsafeSlice i n (V_Direction v) = V_Direction $ GV.basicUnsafeSlice i n v + {-# INLINE basicUnsafeIndexM #-} basicUnsafeIndexM (V_Direction v) i = toDirection <$> GV.basicUnsafeIndexM v i + basicUnsafeCopy (MV_Direction mv) (V_Direction v) = GV.basicUnsafeCopy mv v + {-# INLINE elemseq #-} elemseq _ = seq instance Show Direction where - show DiagArrow = "↖" show LeftArrow = "←" - show UpArrow = "↑" + show UpArrow = "↑" {-# INLINEABLE boldDirection #-} -boldDirection :: Char -> Char +boldDirection ∷ Char → Char boldDirection '↖' = '⇖' boldDirection '←' = '⇐' boldDirection '↑' = '⇑' -boldDirection d = d +boldDirection d = d + +{- | +Given the cost of deletion, alignment, and insertion (respectively), selects +the least costly direction. In the case of one or more equal costs, the +direction arrows are returned in the following descending order of priority: --- | --- Given the cost of deletion, alignment, and insertion (respectively), selects --- the least costly direction. In the case of one or more equal costs, the --- direction arrows are returned in the following descending order of priority: --- --- [ DiagArrow, LeftArrow, UpArrow ] --- + [ DiagArrow, LeftArrow, UpArrow ] +-} {-# INLINEABLE minimumCostDirection #-} -{-# SPECIALISE INLINE minimumCostDirection :: Int -> Int -> Int -> (# Int , Direction #) #-} -{-# SPECIALISE INLINE minimumCostDirection :: Int8 -> Int8 -> Int8 -> (# Int8 , Direction #) #-} -{-# SPECIALISE INLINE minimumCostDirection :: Int16 -> Int16 -> Int16 -> (# Int16 , Direction #) #-} -{-# SPECIALISE INLINE minimumCostDirection :: Int32 -> Int32 -> Int32 -> (# Int32 , Direction #) #-} -{-# SPECIALISE INLINE minimumCostDirection :: Int64 -> Int64 -> Int64 -> (# Int64 , Direction #) #-} -{-# SPECIALISE INLINE minimumCostDirection :: Word -> Word -> Word -> (# Word , Direction #) #-} -{-# SPECIALISE INLINE minimumCostDirection :: Word8 -> Word8 -> Word8 -> (# Word8 , Direction #) #-} -{-# SPECIALISE INLINE minimumCostDirection :: Word16 -> Word16 -> Word16 -> (# Word16, Direction #) #-} -{-# SPECIALISE INLINE minimumCostDirection :: Word32 -> Word32 -> Word32 -> (# Word32, Direction #) #-} -{-# SPECIALISE INLINE minimumCostDirection :: Word64 -> Word64 -> Word64 -> (# Word64, Direction #) #-} +{-# SPECIALIZE INLINE minimumCostDirection ∷ Int → Int → Int → (# Int, Direction #) #-} +{-# SPECIALIZE INLINE minimumCostDirection ∷ Int8 → Int8 → Int8 → (# Int8, Direction #) #-} +{-# SPECIALIZE INLINE minimumCostDirection ∷ Int16 → Int16 → Int16 → (# Int16, Direction #) #-} +{-# SPECIALIZE INLINE minimumCostDirection ∷ Int32 → Int32 → Int32 → (# Int32, Direction #) #-} +{-# SPECIALIZE INLINE minimumCostDirection ∷ Int64 → Int64 → Int64 → (# Int64, Direction #) #-} +{-# SPECIALIZE INLINE minimumCostDirection ∷ Word → Word → Word → (# Word, Direction #) #-} +{-# SPECIALIZE INLINE minimumCostDirection ∷ Word8 → Word8 → Word8 → (# Word8, Direction #) #-} +{-# SPECIALIZE INLINE minimumCostDirection ∷ Word16 → Word16 → Word16 → (# Word16, Direction #) #-} +{-# SPECIALIZE INLINE minimumCostDirection ∷ Word32 → Word32 → Word32 → (# Word32, Direction #) #-} +{-# SPECIALIZE INLINE minimumCostDirection ∷ Word64 → Word64 → Word64 → (# Word64, Direction #) #-} minimumCostDirection - :: Ord e - => e - -> e - -> e - -> (# e, Direction #) + ∷ (Ord e) + ⇒ e + → e + → e + → (# e, Direction #) minimumCostDirection delCost alnCost insCost - | alnCost <= delCost = if alnCost <= insCost - then (# alnCost, DiagArrow #) - else (# insCost, UpArrow #) - | delCost <= insCost = (# delCost, LeftArrow #) - | otherwise = (# insCost, UpArrow #) + | alnCost <= delCost = + if alnCost <= insCost + then (# alnCost, DiagArrow #) + else (# insCost, UpArrow #) + | delCost <= insCost = (# delCost, LeftArrow #) + | otherwise = (# insCost, UpArrow #) {-# INLINE fromDirection #-} -fromDirection :: Direction -> Word8 +fromDirection ∷ Direction → Word8 fromDirection DiagArrow = 0 fromDirection LeftArrow = 1 -fromDirection UpArrow = 2 +fromDirection UpArrow = 2 {-# INLINE toDirection #-} -toDirection :: Word8 -> Direction +toDirection ∷ Word8 → Direction toDirection 0 = DiagArrow toDirection 1 = LeftArrow toDirection _ = UpArrow diff --git a/lib/dynamic-character/src/DirectOptimization/Pairwise/Huge.hs b/lib/dynamic-character/src/DirectOptimization/Pairwise/Huge.hs index d369faebd..d6b219fcc 100644 --- a/lib/dynamic-character/src/DirectOptimization/Pairwise/Huge.hs +++ b/lib/dynamic-character/src/DirectOptimization/Pairwise/Huge.hs @@ -1,17 +1,17 @@ -module DirectOptimization.Pairwise.Huge - ( HugeDynamicCharacter - , HugeState - , hugePairwiseDO - ) where +module DirectOptimization.Pairwise.Huge ( + HugeDynamicCharacter, + HugeState, + hugePairwiseDO, +) where import Bio.DynamicCharacter import DirectOptimization.Pairwise.Ukkonen hugePairwiseDO - :: Word - -> (HugeState -> HugeState -> (HugeState, Word)) - -> HugeDynamicCharacter - -> HugeDynamicCharacter - -> (Word, HugeDynamicCharacter) + ∷ Word + → (HugeState → HugeState → (HugeState, Word)) + → HugeDynamicCharacter + → HugeDynamicCharacter + → (Word, HugeDynamicCharacter) hugePairwiseDO = ukkonenDO diff --git a/lib/dynamic-character/src/DirectOptimization/Pairwise/Internal.hs b/lib/dynamic-character/src/DirectOptimization/Pairwise/Internal.hs index f3dab0dec..1130b03a4 100644 --- a/lib/dynamic-character/src/DirectOptimization/Pairwise/Internal.hs +++ b/lib/dynamic-character/src/DirectOptimization/Pairwise/Internal.hs @@ -1,235 +1,294 @@ ------------------------------------------------------------------------------ --- | --- Module : DirectOptimization.Pairwise.Internal --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- --- Direct optimization pairwise alignment using the Needleman-Wunsch algorithm. --- These functions will allocate an M * N matrix. --- ------------------------------------------------------------------------------ - -{-# LANGUAGE ApplicativeDo #-} +{-# LANGUAGE ApplicativeDo #-} {-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE Strict #-} +{-# LANGUAGE Strict #-} + +{- | +Module : DirectOptimization.Pairwise.Internal +Copyright : (c) 2015-2021 Ward Wheeler +License : BSD-style + +Maintainer : wheeler@amnh.org +Stability : provisional +Portability : portable + +Direct optimization pairwise alignment using the Needleman-Wunsch algorithm. +These functions will allocate an M * N matrix. +-} +module DirectOptimization.Pairwise.Internal ( + -- * Alignment types + Direction (..), + TCMλ, -module DirectOptimization.Pairwise.Internal - ( -- * Alignment types - Direction(..) - , TCMλ -- * Alignment feneric functions - , directOptimization - , directOptimizationFromDirectionMatrix - ) where - -import Bio.DynamicCharacter -import Bio.DynamicCharacter.HandleGaps -import Bio.DynamicCharacter.Measure -import Control.Applicative -import Control.Monad (join) -import Control.Monad.Loops (whileM_) -import Data.Bits -import Data.Matrix.Class (Matrix, dim, unsafeIndex) -import qualified Data.Matrix.Unboxed as UM -import Data.STRef -import qualified Data.Vector as V -import Data.Vector.Generic (Vector, (!), (!?)) -import qualified Data.Vector.Generic as GV -import qualified Data.Vector.Storable as SV -import qualified Data.Vector.Unboxed as UV -import DirectOptimization.Pairwise.Direction - - --- | --- A generalized function representation: the "overlap" between dynamic character --- elements, supplying the corresponding median and cost to align the two --- characters. -type TCMλ e = e -> e -> (e, Word) + directOptimization, + directOptimizationFromDirectionMatrix, +) where + +import Bio.DynamicCharacter +import Bio.DynamicCharacter.Element (SlimState, WideState) +import Bio.DynamicCharacter.HandleGaps +import Bio.DynamicCharacter.Measure +import Control.Applicative +import Control.Monad (join) +import Control.Monad.Loops (whileM_) +import Data.Bits +import Data.Matrix.Class (Matrix, dim, unsafeIndex) +import Data.Matrix.Unboxed qualified as UM +import Data.STRef +import Data.Vector qualified as V +import Data.Vector.Generic (Vector, (!), (!?)) +import Data.Vector.Generic qualified as GV +import Data.Vector.Storable qualified as SV +import Data.Vector.Unboxed qualified as UV +import DirectOptimization.Pairwise.Direction + + +{- | +A generalized function representation: the "overlap" between dynamic character +elements, supplying the corresponding median and cost to align the two +characters. +-} +type TCMλ e = e → e → (e, Word) {-# SCC directOptimization #-} {-# INLINEABLE directOptimization #-} -{-# SPECIALISE directOptimization :: (SV.Vector SlimState -> SV.Vector SlimState -> (Word, SlimDynamicCharacter)) -> (SlimState -> SlimState -> (SlimState, Word)) -> SlimDynamicCharacter -> SlimDynamicCharacter -> (Word, SlimDynamicCharacter) #-} -{-# SPECIALISE directOptimization :: (UV.Vector WideState -> UV.Vector WideState -> (Word, WideDynamicCharacter)) -> (WideState -> WideState -> (WideState, Word)) -> WideDynamicCharacter -> WideDynamicCharacter -> (Word, WideDynamicCharacter) #-} -{-# SPECIALISE directOptimization :: ( V.Vector HugeState -> V.Vector HugeState -> (Word, HugeDynamicCharacter)) -> (HugeState -> HugeState -> (HugeState, Word)) -> HugeDynamicCharacter -> HugeDynamicCharacter -> (Word, HugeDynamicCharacter) #-} +{-# SPECIALIZE directOptimization ∷ + (SV.Vector SlimState → SV.Vector SlimState → (Word, SlimDynamicCharacter)) + → (SlimState → SlimState → (SlimState, Word)) + → SlimDynamicCharacter + → SlimDynamicCharacter + → (Word, SlimDynamicCharacter) + #-} +{-# SPECIALIZE directOptimization ∷ + (UV.Vector WideState → UV.Vector WideState → (Word, WideDynamicCharacter)) + → (WideState → WideState → (WideState, Word)) + → WideDynamicCharacter + → WideDynamicCharacter + → (Word, WideDynamicCharacter) + #-} +{-# SPECIALIZE directOptimization ∷ + (V.Vector HugeState → V.Vector HugeState → (Word, HugeDynamicCharacter)) + → (HugeState → HugeState → (HugeState, Word)) + → HugeDynamicCharacter + → HugeDynamicCharacter + → (Word, HugeDynamicCharacter) + #-} directOptimization - :: ( FiniteBits e - , Ord (v e) - , Vector v e - ) - => (v e -> v e -> (Word, OpenDynamicCharacter v e)) -- ^ Alignment function - -> TCMλ e -- ^ Metric for computing state distance and median state - -> OpenDynamicCharacter v e - -> OpenDynamicCharacter v e - -> (Word, OpenDynamicCharacter v e) + ∷ ( FiniteBits e + , Ord (v e) + , Vector v e + ) + ⇒ (v e → v e → (Word, OpenDynamicCharacter v e)) + -- ^ Alignment function + → TCMλ e + -- ^ Metric for computing state distance and median state + → OpenDynamicCharacter v e + → OpenDynamicCharacter v e + → (Word, OpenDynamicCharacter v e) directOptimization alignmentFunction overlapλ = handleMissing generateAlignmentResult - where - generateAlignmentResult lhs rhs = - let -- Build a 'gap' state now that we know that we can access a non-empty sequence. - gap = let tmp = extractMediansGapped rhs ! 0 in (tmp `xor` tmp) `setBit` 0 - -- Remove gaps from the inputs and measure the results to determine - -- which ungapped character is longer and which is shorter. - -- Always pass the shorter character into alignment functions first! - ~(swapped, gapsLesser, gapsLonger, lesser, longer) = measureCharactersWithoutGaps lhs rhs - lesserMeds = extractMediansGapped lesser - longerMeds = extractMediansGapped longer - ~(alignmentCost, ungappedAlignment) = - if GV.length lesserMeds == 0 - -- Neither character was Missing, but one or both are empty when gaps are removed - then alignmentWithAllGaps overlapλ longerMeds - -- Both have some non-gap elements, perform string alignment - else alignmentFunction lesserMeds longerMeds - regappedAlignment = insertGaps gap gapsLesser gapsLonger ungappedAlignment - transformation = if swapped then transposeCharacter else id - alignmentContext = transformation regappedAlignment - in (alignmentCost, forceDynamicCharacter alignmentContext) + where + generateAlignmentResult lhs rhs = + let -- Build a 'gap' state now that we know that we can access a non-empty sequence. + gap = let tmp = extractMediansGapped rhs ! 0 in (tmp `xor` tmp) `setBit` 0 + -- Remove gaps from the inputs and measure the results to determine + -- which ungapped character is longer and which is shorter. + -- Always pass the shorter character into alignment functions first! + ~(swapped, gapsLesser, gapsLonger, lesser, longer) = measureCharactersWithoutGaps lhs rhs + lesserMeds = extractMediansGapped lesser + longerMeds = extractMediansGapped longer + ~(alignmentCost, ungappedAlignment) = + if GV.length lesserMeds == 0 + then -- Neither character was Missing, but one or both are empty when gaps are removed + alignmentWithAllGaps overlapλ longerMeds + else -- Both have some non-gap elements, perform string alignment + alignmentFunction lesserMeds longerMeds + regappedAlignment = insertGaps gap gapsLesser gapsLonger ungappedAlignment + transformation = if swapped then transposeCharacter else id + alignmentContext = transformation regappedAlignment + in (alignmentCost, forceDynamicCharacter alignmentContext) {-# SCC directOptimizationFromDirectionMatrix #-} {-# INLINEABLE directOptimizationFromDirectionMatrix #-} -{-# SPECIALISE directOptimizationFromDirectionMatrix :: (WideState -> (WideState -> WideState -> (WideState, Word)) -> UV.Vector WideState -> UV.Vector WideState -> (Word, UM.Matrix Direction)) -> (WideState -> WideState -> (WideState, Word)) -> WideDynamicCharacter -> WideDynamicCharacter -> (Word, WideDynamicCharacter) #-} -{-# SPECIALISE directOptimizationFromDirectionMatrix :: (HugeState -> (HugeState -> HugeState -> (HugeState, Word)) -> V.Vector HugeState -> V.Vector HugeState -> (Word, UM.Matrix Direction)) -> (HugeState -> HugeState -> (HugeState, Word)) -> HugeDynamicCharacter -> HugeDynamicCharacter -> (Word, HugeDynamicCharacter) #-} +{-# SPECIALIZE directOptimizationFromDirectionMatrix ∷ + ( WideState + → (WideState → WideState → (WideState, Word)) + → UV.Vector WideState + → UV.Vector WideState + → (Word, UM.Matrix Direction) + ) + → (WideState → WideState → (WideState, Word)) + → WideDynamicCharacter + → WideDynamicCharacter + → (Word, WideDynamicCharacter) + #-} +{-# SPECIALIZE directOptimizationFromDirectionMatrix ∷ + ( HugeState + → (HugeState → HugeState → (HugeState, Word)) + → V.Vector HugeState + → V.Vector HugeState + → (Word, UM.Matrix Direction) + ) + → (HugeState → HugeState → (HugeState, Word)) + → HugeDynamicCharacter + → HugeDynamicCharacter + → (Word, HugeDynamicCharacter) + #-} directOptimizationFromDirectionMatrix - :: ( FiniteBits e - , Matrix m t Direction - , Ord (v e) - , Vector v e - ) - => (e -> TCMλ e -> v e -> v e -> (Word, m t Direction)) -- ^ Alignment matrix generator function - -> TCMλ e -- ^ Metric for computing state distance and median state - -> OpenDynamicCharacter v e - -> OpenDynamicCharacter v e - -> (Word, OpenDynamicCharacter v e) + ∷ ( FiniteBits e + , Matrix m t Direction + , Ord (v e) + , Vector v e + ) + ⇒ (e → TCMλ e → v e → v e → (Word, m t Direction)) + -- ^ Alignment matrix generator function + → TCMλ e + -- ^ Metric for computing state distance and median state + → OpenDynamicCharacter v e + → OpenDynamicCharacter v e + → (Word, OpenDynamicCharacter v e) directOptimizationFromDirectionMatrix matrixGenerator overlapλ = handleMissing $ directOptimization alignmentFunction overlapλ - where - alignmentFunction lhs rhs = - let gap = let tmp = rhs ! 0 in (tmp `xor` tmp) `setBit` 0 - (cost, traversalMatrix) = matrixGenerator gap overlapλ lhs rhs - in (cost, traceback gap overlapλ traversalMatrix lhs rhs) + where + alignmentFunction lhs rhs = + let gap = let tmp = rhs ! 0 in (tmp `xor` tmp) `setBit` 0 + (cost, traversalMatrix) = matrixGenerator gap overlapλ lhs rhs + in (cost, traceback gap overlapλ traversalMatrix lhs rhs) {-# SCC traceback #-} {-# INLINEABLE traceback #-} -{-# SPECIALISE traceback :: WideState -> (WideState -> WideState -> (WideState, Word)) -> UM.Matrix Direction -> UV.Vector WideState -> UV.Vector WideState -> WideDynamicCharacter #-} -{-# SPECIALISE traceback :: HugeState -> (HugeState -> HugeState -> (HugeState, Word)) -> UM.Matrix Direction -> V.Vector HugeState -> V.Vector HugeState -> HugeDynamicCharacter #-} +{-# SPECIALIZE traceback ∷ + WideState + → (WideState → WideState → (WideState, Word)) + → UM.Matrix Direction + → UV.Vector WideState + → UV.Vector WideState + → WideDynamicCharacter + #-} +{-# SPECIALIZE traceback ∷ + HugeState + → (HugeState → HugeState → (HugeState, Word)) + → UM.Matrix Direction + → V.Vector HugeState + → V.Vector HugeState + → HugeDynamicCharacter + #-} traceback - :: ( Bits e - , Matrix m t Direction - , Vector v e - ) - => e - -> TCMλ e - -> m t Direction - -> v e -- ^ Shorter dynamic character related to the "left column" - -> v e -- ^ Longer dynamic character related to the "top row" - -> OpenDynamicCharacter v e -- ^ Resulting dynamic character alignment context + ∷ ( Bits e + , Matrix m t Direction + , Vector v e + ) + ⇒ e + → TCMλ e + → m t Direction + → v e + -- ^ Shorter dynamic character related to the "left column" + → v e + -- ^ Longer dynamic character related to the "top row" + → OpenDynamicCharacter v e + -- ^ Resulting dynamic character alignment context traceback gap overlapλ directionMatrix lesser longer = forceDynamicCharacter alignment - where - f x y = fst $ overlapλ x y - getDirection = curry $ unsafeIndex directionMatrix - -- The maximum size the alignment could be - bufferLength = toEnum $ GV.length lesser + GV.length longer - - -- Construct the aligned dynamic character by using a buffer of it's maximum - -- possible length. Computet the length will performing the traceback. Then - -- after the traceback is performed, copy from the buffer to create a dynamic - -- character of the correct size. - -- - -- NOTE: The buffer creation, copying, and freeing are all handled by the - -- 'unsafeCharacterBuiltByBufferedST' function. - alignment = unsafeCharacterBuiltByBufferedST bufferLength $ \a -> do - let (m,n) = dim directionMatrix - iR <- newSTRef $ m - 1 - jR <- newSTRef $ n - 1 - kR <- newSTRef $ fromEnum bufferLength - - -- Set up convenience methods for accessing character elements - let getElement char ref = (char !) <$> (modifySTRef ref pred *> readSTRef ref) - let getLesserElement = getElement lesser iR - let getLongerElement = getElement longer jR - - -- Set up convenience methods for setting alignment elements on traceback - let setElementAt = modifySTRef kR pred *> readSTRef kR - let delete re = setElementAt >>= \k -> setDelete a k (f gap re) re - let align le re = setElementAt >>= \k -> setAlign a k le (f le re) re - let insert le = setElementAt >>= \k -> setInsert a k le (f le gap) - - -- Determine when to break the alignment loop - let continueAlignment = - let notAtOrigin i j = i /= 0 || j /= 0 - in liftA2 notAtOrigin (readSTRef iR) $ readSTRef jR - - -- Perform traceback - whileM_ continueAlignment $ do - arrow <- liftA2 getDirection (readSTRef iR) $ readSTRef jR - case arrow of - LeftArrow -> getLongerElement >>= delete - DiagArrow -> join $ liftA2 align getLesserElement getLongerElement - UpArrow -> getLesserElement >>= insert - - -- Return the actual alignment length - k <- readSTRef kR - pure $ bufferLength - toEnum k + where + f x y = fst $ overlapλ x y + getDirection = curry $ unsafeIndex directionMatrix + -- The maximum size the alignment could be + bufferLength = toEnum $ GV.length lesser + GV.length longer + + -- Construct the aligned dynamic character by using a buffer of it's maximum + -- possible length. Computet the length will performing the traceback. Then + -- after the traceback is performed, copy from the buffer to create a dynamic + -- character of the correct size. + -- + -- NOTE: The buffer creation, copying, and freeing are all handled by the + -- 'unsafeCharacterBuiltByBufferedST' function. + alignment = unsafeCharacterBuiltByBufferedST bufferLength $ \a → do + let (m, n) = dim directionMatrix + iR ← newSTRef $ m - 1 + jR ← newSTRef $ n - 1 + kR ← newSTRef $ fromEnum bufferLength + + -- Set up convenience methods for accessing character elements + let getElement char ref = (char !) <$> (modifySTRef ref pred *> readSTRef ref) + let getLesserElement = getElement lesser iR + let getLongerElement = getElement longer jR + + -- Set up convenience methods for setting alignment elements on traceback + let setElementAt = modifySTRef kR pred *> readSTRef kR + let delete re = setElementAt >>= \k → setDelete a k (f gap re) re + let align le re = setElementAt >>= \k → setAlign a k le (f le re) re + let insert le = setElementAt >>= \k → setInsert a k le (f le gap) + + -- Determine when to break the alignment loop + let continueAlignment = + let notAtOrigin i j = i /= 0 || j /= 0 + in liftA2 notAtOrigin (readSTRef iR) $ readSTRef jR + + -- Perform traceback + whileM_ continueAlignment $ do + arrow ← liftA2 getDirection (readSTRef iR) $ readSTRef jR + case arrow of + LeftArrow → getLongerElement >>= delete + DiagArrow → join $ liftA2 align getLesserElement getLongerElement + UpArrow → getLesserElement >>= insert + + -- Return the actual alignment length + k ← readSTRef kR + pure $ bufferLength - toEnum k {-# SCC alignmentWithAllGaps #-} {-# INLINEABLE alignmentWithAllGaps #-} alignmentWithAllGaps - :: ( Bits e - , Vector v e - ) - => TCMλ e - -> v e - -> (Word, OpenDynamicCharacter v e) + ∷ ( Bits e + , Vector v e + ) + ⇒ TCMλ e + → v e + → (Word, OpenDynamicCharacter v e) alignmentWithAllGaps overlapλ character = case character !? 0 of - -- Neither character was Missing, but both are empty when gaps are removed - Nothing -> (0, (GV.empty, GV.empty, GV.empty)) - -- Neither character was Missing, but one of them is empty when gaps are removed - Just e -> - let len = GV.length character - nil = e `xor` e - gap = nil`setBit` 0 - zed = GV.replicate len nil - med = GV.generate len $ fst . overlapλ gap . (character !) - in (0, (zed, med, character)) + -- Neither character was Missing, but both are empty when gaps are removed + Nothing → (0, (GV.empty, GV.empty, GV.empty)) + -- Neither character was Missing, but one of them is empty when gaps are removed + Just e → + let len = GV.length character + nil = e `xor` e + gap = nil `setBit` 0 + zed = GV.replicate len nil + med = GV.generate len $ fst . overlapλ gap . (character !) + in (0, (zed, med, character)) {-# SCC handleMissing #-} handleMissing - :: ( Bits e - , Vector v e - ) - => (OpenDynamicCharacter v e -> OpenDynamicCharacter v e -> (Word, OpenDynamicCharacter v e)) - -> OpenDynamicCharacter v e - -> OpenDynamicCharacter v e - -> (Word, OpenDynamicCharacter v e) + ∷ ( Bits e + , Vector v e + ) + ⇒ (OpenDynamicCharacter v e → OpenDynamicCharacter v e → (Word, OpenDynamicCharacter v e)) + → OpenDynamicCharacter v e + → OpenDynamicCharacter v e + → (Word, OpenDynamicCharacter v e) handleMissing f lhs rhs = - let implicitlyAlignWithMissing strIsLeft (_,med,_) = - let sym = med GV.! 0 - zed = sym `xor` sym - nil = GV.replicate (GV.length med) zed - str | strIsLeft = (med, med, nil) - | otherwise = (nil, med, med) - in (0, str) - in case (isMissing lhs, isMissing rhs) of - -- Case 1: *Both* children are missing - -- Without loss of generality, we return the "left" child, as both children are identical. - (True , True ) -> (0, lhs) - - -- Case 2: The *left* child is missing - -- Implicitly align the non-missing string with an equal length string of '?' missing symbols - (True , False) -> implicitlyAlignWithMissing False rhs - - -- Case 2: The *right* child is missing - -- Implicitly align the non-missing string with an equal length string of '?' missing symbols - (False, True ) -> implicitlyAlignWithMissing True lhs - - -- Case 4: *Niether* child is missing - -- Proceed with string alignment - (False, False) -> f lhs rhs + let implicitlyAlignWithMissing strIsLeft (_, med, _) = + let sym = med GV.! 0 + zed = sym `xor` sym + nil = GV.replicate (GV.length med) zed + str + | strIsLeft = (med, med, nil) + | otherwise = (nil, med, med) + in (0, str) + in case (isMissing lhs, isMissing rhs) of + -- Case 1: *Both* children are missing + -- Without loss of generality, we return the "left" child, as both children are identical. + (True, True) → (0, lhs) + -- Case 2: The *left* child is missing + -- Implicitly align the non-missing string with an equal length string of '?' missing symbols + (True, False) → implicitlyAlignWithMissing False rhs + -- Case 2: The *right* child is missing + -- Implicitly align the non-missing string with an equal length string of '?' missing symbols + (False, True) → implicitlyAlignWithMissing True lhs + -- Case 4: *Niether* child is missing + -- Proceed with string alignment + (False, False) → f lhs rhs diff --git a/lib/dynamic-character/src/DirectOptimization/Pairwise/Slim.hs b/lib/dynamic-character/src/DirectOptimization/Pairwise/Slim.hs index 3fe46a2f5..8c4f7cf6f 100644 --- a/lib/dynamic-character/src/DirectOptimization/Pairwise/Slim.hs +++ b/lib/dynamic-character/src/DirectOptimization/Pairwise/Slim.hs @@ -1,16 +1,17 @@ -module DirectOptimization.Pairwise.Slim - ( SlimDynamicCharacter - , SlimState - , slimPairwiseDO - ) where +module DirectOptimization.Pairwise.Slim ( + SlimDynamicCharacter, + SlimState, + slimPairwiseDO, +) where import Bio.DynamicCharacter +import Bio.DynamicCharacter.Element (SlimState) import DirectOptimization.Pairwise.Slim.FFI slimPairwiseDO - :: DenseTransitionCostMatrix - -> SlimDynamicCharacter - -> SlimDynamicCharacter - -> (Word, SlimDynamicCharacter) + ∷ DenseTransitionCostMatrix + → SlimDynamicCharacter + → SlimDynamicCharacter + → (Word, SlimDynamicCharacter) slimPairwiseDO = smallAlphabetPairwiseDO diff --git a/lib/dynamic-character/src/DirectOptimization/Pairwise/Slim/FFI.hsc b/lib/dynamic-character/src/DirectOptimization/Pairwise/Slim/FFI.hsc index 615b58cd6..1a7eaefbc 100644 --- a/lib/dynamic-character/src/DirectOptimization/Pairwise/Slim/FFI.hsc +++ b/lib/dynamic-character/src/DirectOptimization/Pairwise/Slim/FFI.hsc @@ -228,7 +228,7 @@ algn2d computeUnion computeMedians denseTCMs = directOptimization useC $ lookupP ics = coerce . (toEnum :: Int -> Word64) {-# SCC csi #-} csi :: CSize -> Int - csi = (fromEnum :: Word64 -> Int) . coerce + csi = (fromEnum :: Word64 -> Int) . coerce @@ -350,9 +350,17 @@ finalizeCharacterBuffer :: Int -> Int -> Vector SlimState -> Vector SlimState finalizeCharacterBuffer bufferLength alignedLength = let e = min bufferLength alignedLength off = bufferLength - e - in basicUnsafeSlice off e + in basicUnsafeSlice off e +-- | +-- Read and free the length of the resulting alignment. +getAlignedLength :: Ptr CSize -> IO Int +getAlignedLength lenRef = + let f = coerce :: CSize -> Word64 + in (fromEnum . f <$> peek lenRef) <* free lenRef + +{- -- | -- Allocates space for an align_io struct to be sent to C. {-# SCC allocCharacterBuffer #-} @@ -381,21 +389,6 @@ buildResult bufferLength alignedLength alignedBuffer = off = bufferLength - e ref = advancePtr alignedBuffer off in (V.fromListN e <$> peekArray e ref) <* free alignedBuffer - - -{- -buildResult :: Int -> Int -> Ptr SlimState -> IO (Vector SlimState) -buildResult bufferLength alignedLength alignedBuffer = do - let e = min bufferLength alignedLength - let off = bufferLength - e - let ref = advancePtr alignedBuffer off - (V.fromListN e <$> peekArray e ref) <* free alignedBuffer - vector <- mallocArray alignedLength - copyArray vector ref e - free alignedBuffer - fPtr <- newConcForeignPtr vector (free vector) - let res = V.unsafeFromForeignPtr0 fPtr e :: Vector CUInt - pure res -} @@ -410,4 +403,3 @@ buildResult bufferLength alignedLength alignedBuffer = do {-# SPECIALISE coerceEnum :: CUInt -> Word #-} coerceEnum :: (Enum a, Enum b) => a -> b coerceEnum = toEnum . fromEnum - diff --git a/lib/dynamic-character/src/DirectOptimization/Pairwise/Swapping.hs b/lib/dynamic-character/src/DirectOptimization/Pairwise/Swapping.hs index a02010311..35935c9f9 100644 --- a/lib/dynamic-character/src/DirectOptimization/Pairwise/Swapping.hs +++ b/lib/dynamic-character/src/DirectOptimization/Pairwise/Swapping.hs @@ -1,138 +1,161 @@ ------------------------------------------------------------------------------ --- | --- Module : DirectOptimization.Pairwise.Swapping --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- --- Direct optimization pairwise alignment using the Needleman-Wunsch algorithm. --- These functions will allocate an M * N matrix. --- ------------------------------------------------------------------------------ - -{-# LANGUAGE ApplicativeDo #-} -{-# LANGUAGE ConstraintKinds #-} +{-# LANGUAGE ApplicativeDo #-} +{-# LANGUAGE ConstraintKinds #-} {-# LANGUAGE FlexibleContexts #-} -{-# LANGUAGE Strict #-} -{-# LANGUAGE TypeFamilies #-} -{-# LANGUAGE UnboxedTuples #-} +{-# LANGUAGE Strict #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE UnboxedTuples #-} + +{- | +Module : DirectOptimization.Pairwise.Swapping +Copyright : (c) 2015-2021 Ward Wheeler +License : BSD-style -module DirectOptimization.Pairwise.Swapping - ( Direction() - , swappingDO - , buildDirectionMatrix - , minimumCostDirection - ) where +Maintainer : wheeler@amnh.org +Stability : provisional +Portability : portable -import Bio.DynamicCharacter -import Control.Monad.ST -import Data.Bits -import Data.Foldable -import Data.Matrix.Unboxed (Matrix, unsafeFreeze) -import qualified Data.Matrix.Unboxed.Mutable as M -import qualified Data.Vector as V -import Data.Vector.Generic (Vector, (!)) -import qualified Data.Vector.Generic as GV -import qualified Data.Vector.Unboxed as UV -import qualified Data.Vector.Unboxed.Mutable as MUV -import DirectOptimization.Pairwise.Direction -import DirectOptimization.Pairwise.Internal +Direct optimization pairwise alignment using the Needleman-Wunsch algorithm. +These functions will allocate an M * N matrix. +-} +module DirectOptimization.Pairwise.Swapping ( + Direction (), + swappingDO, + buildDirectionMatrix, + minimumCostDirection, +) where +import Bio.DynamicCharacter +import Bio.DynamicCharacter.Element (WideState) +import Control.Monad.ST +import Data.Bits +import Data.Foldable +import Data.Matrix.Unboxed (Matrix, unsafeFreeze) +import Data.Matrix.Unboxed.Mutable qualified as M +import Data.Vector qualified as V +import Data.Vector.Generic (Vector, (!)) +import Data.Vector.Generic qualified as GV +import Data.Vector.Unboxed qualified as UV +import Data.Vector.Unboxed.Mutable qualified as MUV +import DirectOptimization.Pairwise.Direction +import DirectOptimization.Pairwise.Internal --- | --- Performs a naive direct optimization. --- Takes in two characters to run DO on and an overlap function --- Returns an assignment character, the cost of that assignment, the assignment --- character with gaps included, the aligned version of the first input character, --- and the aligned version of the second input character. The process for this --- algorithm is to generate a traversal matrix, then perform a traceback. -{-# SCC swappingDO #-} + +{- | +Performs a naive direct optimization. +Takes in two characters to run DO on and an overlap function +Returns an assignment character, the cost of that assignment, the assignment +character with gaps included, the aligned version of the first input character, +and the aligned version of the second input character. The process for this +algorithm is to generate a traversal matrix, then perform a traceback. +-} +{-# SCC swappingDO #-} {-# INLINEABLE swappingDO #-} -{-# SPECIALISE swappingDO :: (WideState -> WideState -> (WideState, Word)) -> WideDynamicCharacter -> WideDynamicCharacter -> (Word, WideDynamicCharacter) #-} -{-# SPECIALISE swappingDO :: (HugeState -> HugeState -> (HugeState, Word)) -> HugeDynamicCharacter -> HugeDynamicCharacter -> (Word, HugeDynamicCharacter) #-} +{-# SPECIALIZE swappingDO ∷ + (WideState → WideState → (WideState, Word)) → WideDynamicCharacter → WideDynamicCharacter → (Word, WideDynamicCharacter) + #-} +{-# SPECIALIZE swappingDO ∷ + (HugeState → HugeState → (HugeState, Word)) → HugeDynamicCharacter → HugeDynamicCharacter → (Word, HugeDynamicCharacter) + #-} swappingDO - :: ( FiniteBits e - , Ord (v e) - , Vector v e - ) - => TCMλ e - -> OpenDynamicCharacter v e - -> OpenDynamicCharacter v e - -> (Word, OpenDynamicCharacter v e) + ∷ ( FiniteBits e + , Ord (v e) + , Vector v e + ) + ⇒ TCMλ e + → OpenDynamicCharacter v e + → OpenDynamicCharacter v e + → (Word, OpenDynamicCharacter v e) swappingDO = directOptimizationFromDirectionMatrix buildDirectionMatrix -{-# SCC buildDirectionMatrix #-} +{-# SCC buildDirectionMatrix #-} {-# INLINEABLE buildDirectionMatrix #-} -{-# SPECIALISE buildDirectionMatrix :: WideState -> (WideState -> WideState -> (WideState, Word)) -> UV.Vector WideState -> UV.Vector WideState -> (Word, Matrix Direction) #-} -{-# SPECIALISE buildDirectionMatrix :: HugeState -> (HugeState -> HugeState -> (HugeState, Word)) -> V.Vector HugeState -> V.Vector HugeState -> (Word, Matrix Direction) #-} +{-# SPECIALIZE buildDirectionMatrix ∷ + WideState + → (WideState → WideState → (WideState, Word)) + → UV.Vector WideState + → UV.Vector WideState + → (Word, Matrix Direction) + #-} +{-# SPECIALIZE buildDirectionMatrix ∷ + HugeState + → (HugeState → HugeState → (HugeState, Word)) + → V.Vector HugeState + → V.Vector HugeState + → (Word, Matrix Direction) + #-} buildDirectionMatrix - :: Vector v e - => e -- ^ Gap state - -> TCMλ e -- ^ Metric between states producing the medoid of states. - -> v e -- ^ Shorter dynamic character related to the "left column" - -> v e -- ^ Longer dynamic character related to the "top row" - -> (Word, Matrix Direction) + ∷ (Vector v e) + ⇒ e + -- ^ Gap state + → TCMλ e + -- ^ Metric between states producing the medoid of states. + → v e + -- ^ Shorter dynamic character related to the "left column" + → v e + -- ^ Longer dynamic character related to the "top row" + → (Word, Matrix Direction) buildDirectionMatrix gap tcmλ lesserLeft longerTop = fullMatrix - where - costλ x y = snd $ tcmλ x y - rows = GV.length lesserLeft + 1 - cols = GV.length longerTop + 1 - - fullMatrix = runST $ do - mDir <- M.new (rows, cols) - vOne <- MUV.new cols - vTwo <- MUV.new cols + where + costλ x y = snd $ tcmλ x y + rows = GV.length lesserLeft + 1 + cols = GV.length longerTop + 1 - let write v p@(~(_,j)) c d = MUV.unsafeWrite v j c *> M.unsafeWrite mDir p d + fullMatrix = runST $ do + mDir ← M.new (rows, cols) + vOne ← MUV.new cols + vTwo ← MUV.new cols - write vOne (0,0) 0 DiagArrow + let write v p@(~(_, j)) c d = MUV.unsafeWrite v j c *> M.unsafeWrite mDir p d - -- Special case the first row - -- We need to ensure that there are only Left Arrow values in the directional matrix. - -- We can also reduce the number of comparisons the first row makes from 3 to 1, - -- since the diagonal and upward values are "out of bounds." - for_ [1 .. cols - 1] $ \j -> - let topElement = longerTop ! (j - 1) - firstCellCost = costλ gap topElement - in do firstPrevCost <- MUV.unsafeRead vOne (j - 1) - write vOne (0,j) (firstCellCost + firstPrevCost) LeftArrow + write vOne (0, 0) 0 DiagArrow - for_ [1 .. rows - 1] $ \i -> - let (prev, curr) - | odd i = (vOne, vTwo) - | otherwise = (vTwo, vOne) - leftElement = lesserLeft ! (i - 1) - -- Special case the first cell of each row - -- We need to ensure that there are only Up Arrow values in the directional matrix. + -- Special case the first row + -- We need to ensure that there are only Left Arrow values in the directional matrix. -- We can also reduce the number of comparisons the first row makes from 3 to 1, - -- since the diagonal and leftward values are "out of bounds." - firstCellCost = costλ leftElement gap - in do firstPrevCost <- MUV.unsafeRead prev 0 - write curr (i,0) (firstCellCost + firstPrevCost) UpArrow - -- Finish special case for first cell of each row - -- Begin processing all other cells in the curr vector - for_ [1 .. cols - 1] $ \j -> - let topElement = longerTop ! (j - 1) - deleteCost = costλ gap topElement - alignCost = costλ leftElement topElement - insertCost = costλ leftElement gap - in do diagCost <- MUV.unsafeRead prev $ j - 1 - topCost <- MUV.unsafeRead prev j - leftCost <- MUV.unsafeRead curr $ j - 1 - let (# c, d #) = minimumCostDirection - (deleteCost + leftCost) - ( alignCost + diagCost) - (insertCost + topCost) - write curr (i,j) c d + -- since the diagonal and upward values are "out of bounds." + for_ [1 .. cols - 1] $ \j → + let topElement = longerTop ! (j - 1) + firstCellCost = costλ gap topElement + in do + firstPrevCost ← MUV.unsafeRead vOne (j - 1) + write vOne (0, j) (firstCellCost + firstPrevCost) LeftArrow - let v | odd rows = vOne - | otherwise = vTwo + for_ [1 .. rows - 1] $ \i → + let (prev, curr) + | odd i = (vOne, vTwo) + | otherwise = (vTwo, vOne) + leftElement = lesserLeft ! (i - 1) + -- Special case the first cell of each row + -- We need to ensure that there are only Up Arrow values in the directional matrix. + -- We can also reduce the number of comparisons the first row makes from 3 to 1, + -- since the diagonal and leftward values are "out of bounds." + firstCellCost = costλ leftElement gap + in do + firstPrevCost ← MUV.unsafeRead prev 0 + write curr (i, 0) (firstCellCost + firstPrevCost) UpArrow + -- Finish special case for first cell of each row + -- Begin processing all other cells in the curr vector + for_ [1 .. cols - 1] $ \j → + let topElement = longerTop ! (j - 1) + deleteCost = costλ gap topElement + alignCost = costλ leftElement topElement + insertCost = costλ leftElement gap + in do + diagCost ← MUV.unsafeRead prev $ j - 1 + topCost ← MUV.unsafeRead prev j + leftCost ← MUV.unsafeRead curr $ j - 1 + let (# c, d #) = + minimumCostDirection + (deleteCost + leftCost) + (alignCost + diagCost) + (insertCost + topCost) + write curr (i, j) c d - c <- MUV.unsafeRead v (cols - 1) - m <- unsafeFreeze mDir - pure (c, m) + let v + | odd rows = vOne + | otherwise = vTwo + c ← MUV.unsafeRead v (cols - 1) + m ← unsafeFreeze mDir + pure (c, m) diff --git a/lib/dynamic-character/src/DirectOptimization/Pairwise/Ukkonen.hs b/lib/dynamic-character/src/DirectOptimization/Pairwise/Ukkonen.hs index a702c7493..73959cb16 100644 --- a/lib/dynamic-character/src/DirectOptimization/Pairwise/Ukkonen.hs +++ b/lib/dynamic-character/src/DirectOptimization/Pairwise/Ukkonen.hs @@ -1,799 +1,878 @@ ------------------------------------------------------------------------------ --- | --- Module : DirectOptimization.Pairwise.Ukkonen --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- --- Direct optimization pairwise alignment using the Needleman-Wunsch algorithm. --- These functions will allocate an M * N matrix. --- ------------------------------------------------------------------------------ - -{-# LANGUAGE ApplicativeDo #-} -{-# LANGUAGE BangPatterns #-} -{-# LANGUAGE ConstraintKinds #-} -{-# LANGUAGE DerivingStrategies #-} -{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE ApplicativeDo #-} +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE ConstraintKinds #-} +{-# LANGUAGE DerivingStrategies #-} +{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE ScopedTypeVariables #-} -{-# LANGUAGE TypeFamilies #-} -{-# LANGUAGE UnboxedTuples #-} - -module DirectOptimization.Pairwise.Ukkonen - ( Direction() - , ukkonenDO - , createUkkonenMethodMatrix - ) where - -import Bio.DynamicCharacter -import Bio.DynamicCharacter.Measure -import Control.Monad (unless, when) -import Control.Monad.Loops (iterateUntilM, whileM_) -import Control.Monad.Primitive -import Control.Monad.ST -import Data.Bits -import Data.Foldable -import Data.Matrix.Unboxed (Matrix, unsafeFreeze) -import Data.Matrix.Unboxed.Mutable (MMatrix) -import qualified Data.Matrix.Unboxed.Mutable as M -import Data.STRef -import qualified Data.Vector as V -import Data.Vector.Generic (Vector, (!)) -import qualified Data.Vector.Generic as GV -import qualified Data.Vector.Unboxed as UV -import DirectOptimization.Pairwise.Internal -import DirectOptimization.Pairwise.Swapping - - --- | --- Performs a naive direct optimization. --- Takes in two characters to run DO on and an overlap function --- Returns an assignment character, the cost of that assignment, the assignment --- character with gaps included, the aligned version of the first input character, --- and the aligned version of the second input character. The process for this --- algorithm is to generate a traversal matrix, then perform a traceback. -{-# SCC ukkonenDO #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE UnboxedTuples #-} + +{- | +Module : DirectOptimization.Pairwise.Ukkonen +Copyright : (c) 2015-2021 Ward Wheeler +License : BSD-style + +Maintainer : wheeler@amnh.org +Stability : provisional +Portability : portable + +Direct optimization pairwise alignment using the Needleman-Wunsch algorithm. +These functions will allocate an M * N matrix. +-} +module DirectOptimization.Pairwise.Ukkonen ( + Direction (), + ukkonenDO, + createUkkonenMethodMatrix, +) where + +import Bio.DynamicCharacter +import Bio.DynamicCharacter.Element (WideState) +import Bio.DynamicCharacter.Measure +import Control.Monad (unless, when) +import Control.Monad.Loops (iterateUntilM, whileM_) +import Control.Monad.Primitive +import Control.Monad.ST +import Data.Bits +import Data.Foldable +import Data.Matrix.Unboxed (Matrix, unsafeFreeze) +import Data.Matrix.Unboxed.Mutable (MMatrix) +import Data.Matrix.Unboxed.Mutable qualified as M +import Data.STRef +import Data.Vector qualified as V +import Data.Vector.Generic (Vector, (!)) +import Data.Vector.Generic qualified as GV +import Data.Vector.Unboxed qualified as UV +import DirectOptimization.Pairwise.Internal +import DirectOptimization.Pairwise.Swapping + + +{- | +Performs a naive direct optimization. +Takes in two characters to run DO on and an overlap function +Returns an assignment character, the cost of that assignment, the assignment +character with gaps included, the aligned version of the first input character, +and the aligned version of the second input character. The process for this +algorithm is to generate a traversal matrix, then perform a traceback. +-} +{-# SCC ukkonenDO #-} {-# INLINEABLE ukkonenDO #-} -{-# SPECIALISE ukkonenDO :: Word -> (WideState -> WideState -> (WideState, Word)) -> WideDynamicCharacter -> WideDynamicCharacter -> (Word, WideDynamicCharacter) #-} -{-# SPECIALISE ukkonenDO :: Word -> (HugeState -> HugeState -> (HugeState, Word)) -> HugeDynamicCharacter -> HugeDynamicCharacter -> (Word, HugeDynamicCharacter) #-} +{-# SPECIALIZE ukkonenDO ∷ + Word + → (WideState → WideState → (WideState, Word)) + → WideDynamicCharacter + → WideDynamicCharacter + → (Word, WideDynamicCharacter) + #-} +{-# SPECIALIZE ukkonenDO ∷ + Word + → (HugeState → HugeState → (HugeState, Word)) + → HugeDynamicCharacter + → HugeDynamicCharacter + → (Word, HugeDynamicCharacter) + #-} ukkonenDO - :: ( FiniteBits e - , Ord (v e) - , Vector v e - ) - => Word -- ^ Coefficient value, representing the /minimum/ transition cost from a state to gap - -> TCMλ e -- ^ Metric between states producing the medoid of states. - -> OpenDynamicCharacter v e -- ^ /1st/ dynamic character - -> OpenDynamicCharacter v e -- ^ /2nd/ dynamic character - -> (Word, OpenDynamicCharacter v e) + ∷ ( FiniteBits e + , Ord (v e) + , Vector v e + ) + ⇒ Word + -- ^ Coefficient value, representing the /minimum/ transition cost from a state to gap + → TCMλ e + -- ^ Metric between states producing the medoid of states. + → OpenDynamicCharacter v e + -- ^ /1st/ dynamic character + → OpenDynamicCharacter v e + -- ^ /2nd/ dynamic character + → (Word, OpenDynamicCharacter v e) ukkonenDO coefficient tcmλ char1 char2 - | noGainFromUkkonenMethod = buildFullMatrix - | otherwise = buildBandMatrix - where - buildFullMatrix = swappingDO tcmλ char1 char2 - buildBandMatrix = directOptimizationFromDirectionMatrix ukkonenBandλ tcmλ char1 char2 - - ukkonenBandλ = createUkkonenMethodMatrix coefficient inputGapAmbiguities - - ~(_,_,_,x,y) = measureCharactersWithoutGaps char1 char2 - - lesser = extractMediansGapped x - longer = extractMediansGapped y - - -- /O(1)/ - -- - -- If the longer character is 50% larger than the shorter character, then - -- there is no point in using the barriers. Rather, we fill the full matrix - -- immediately. - -- - -- Additionally, if the shorter sequence is of length 4 or less, then the - -- initial barrier will be set adjacent to or beyond the lower left and - -- upper right corners. - -- - -- Also, a threshold coefficient is computed as the minimal indel cost from - -- any symbol in the alphabet to gap. However, if the indel cost for any - -- symbol is zero, the algorithm will hang, and a naive approach must be taken. - -- - -- Lastly, if the sum of the gaps in both strings is equal to or exceeds the - -- length of the longer string, then the threshold criteria will never be met - -- by definition. - -- - -- Do not perform Ukkonen's algorithm if and only if: - -- - -- > longerLen >= 1.5 * lesserLen - -- OR - -- > lesserLen <= 4 - -- OR - -- > coefficient == 0 - -- - noGainFromUkkonenMethod = lesserLen <= 4 - || 2 * longerLen >= 3 * lesserLen - || coefficient == 0 - || inputGapAmbiguities >= longerLen - where - longerLen = vLength longer - lesserLen = vLength lesser - - -- /O(n + m)/ - -- - -- NOTE: There will be no *unambiguous* gap elements in the dynamic characters! - -- However, there may be abiguous elements which contain gap as a possibility. - -- - -- If one or more of the character elements contained a gap, diagonal - -- directions in the matrix have an "indel" cost. 'gapsPresentInInputs' is - -- necessary in order to decrement the threshold value to account for this. - -- This was not described in Ukkonen's original paper, as the inputs were assumed - -- not to contain any gaps. - inputGapAmbiguities = char1Gaps + char2Gaps - where - char1Gaps = countGaps lesser - char2Gaps = countGaps longer - countGaps = vLength . GV.filter maybeGap - maybeGap = (`testBit` 0) -- Zero is the gap bit! - - vLength = toEnum . GV.length - - --- | --- /O( (n - m + 1 ) * log(n - m + 1) )/, /n/ >= /m/ --- --- Generates an /optimal/, partially-filled-in matrix using Ukkonen's string --- edit distance algorithm. --- --- Note that the threshold value is lowered more than described in Ukkonen's --- paper. This is to handle input elements that contain a gap. In Ukkonen's --- original description of the algorithm, there was a subtle assumption that --- input did not contain any gap symbols. -{-# SCC createUkkonenMethodMatrix #-} + | noGainFromUkkonenMethod = buildFullMatrix + | otherwise = buildBandMatrix + where + buildFullMatrix = swappingDO tcmλ char1 char2 + buildBandMatrix = directOptimizationFromDirectionMatrix ukkonenBandλ tcmλ char1 char2 + + ukkonenBandλ = createUkkonenMethodMatrix coefficient inputGapAmbiguities + + ~(_, _, _, x, y) = measureCharactersWithoutGaps char1 char2 + + lesser = extractMediansGapped x + longer = extractMediansGapped y + + -- /O(1)/ + -- + -- If the longer character is 50% larger than the shorter character, then + -- there is no point in using the barriers. Rather, we fill the full matrix + -- immediately. + -- + -- Additionally, if the shorter sequence is of length 4 or less, then the + -- initial barrier will be set adjacent to or beyond the lower left and + -- upper right corners. + -- + -- Also, a threshold coefficient is computed as the minimal indel cost from + -- any symbol in the alphabet to gap. However, if the indel cost for any + -- symbol is zero, the algorithm will hang, and a naive approach must be taken. + -- + -- Lastly, if the sum of the gaps in both strings is equal to or exceeds the + -- length of the longer string, then the threshold criteria will never be met + -- by definition. + -- + -- Do not perform Ukkonen's algorithm if and only if: + -- + -- > longerLen >= 1.5 * lesserLen + -- OR + -- > lesserLen <= 4 + -- OR + -- > coefficient == 0 + -- + noGainFromUkkonenMethod = + lesserLen <= 4 + || 2 * longerLen >= 3 * lesserLen + || coefficient == 0 + || inputGapAmbiguities >= longerLen + where + longerLen = vLength longer + lesserLen = vLength lesser + + -- /O(n + m)/ + -- + -- NOTE: There will be no *unambiguous* gap elements in the dynamic characters! + -- However, there may be abiguous elements which contain gap as a possibility. + -- + -- If one or more of the character elements contained a gap, diagonal + -- directions in the matrix have an "indel" cost. 'gapsPresentInInputs' is + -- necessary in order to decrement the threshold value to account for this. + -- This was not described in Ukkonen's original paper, as the inputs were assumed + -- not to contain any gaps. + inputGapAmbiguities = char1Gaps + char2Gaps + where + char1Gaps = countGaps lesser + char2Gaps = countGaps longer + countGaps = vLength . GV.filter maybeGap + maybeGap = (`testBit` 0) -- Zero is the gap bit! + vLength = toEnum . GV.length + + +{- | +/O( (n - m + 1 ) * log(n - m + 1) )/, /n/ >= /m/ + +Generates an /optimal/, partially-filled-in matrix using Ukkonen's string +edit distance algorithm. + +Note that the threshold value is lowered more than described in Ukkonen's +paper. This is to handle input elements that contain a gap. In Ukkonen's +original description of the algorithm, there was a subtle assumption that +input did not contain any gap symbols. +-} +{-# SCC createUkkonenMethodMatrix #-} {-# INLINEABLE createUkkonenMethodMatrix #-} -{-# SPECIALISE createUkkonenMethodMatrix :: Word -> Word -> WideState -> (WideState -> WideState -> (WideState, Word)) -> UV.Vector WideState -> UV.Vector WideState -> (Word, Matrix Direction) #-} -{-# SPECIALISE createUkkonenMethodMatrix :: Word -> Word -> HugeState -> (HugeState -> HugeState -> (HugeState, Word)) -> V.Vector HugeState -> V.Vector HugeState -> (Word, Matrix Direction) #-} +{-# SPECIALIZE createUkkonenMethodMatrix ∷ + Word + → Word + → WideState + → (WideState → WideState → (WideState, Word)) + → UV.Vector WideState + → UV.Vector WideState + → (Word, Matrix Direction) + #-} +{-# SPECIALIZE createUkkonenMethodMatrix ∷ + Word + → Word + → HugeState + → (HugeState → HugeState → (HugeState, Word)) + → V.Vector HugeState + → V.Vector HugeState + → (Word, Matrix Direction) + #-} createUkkonenMethodMatrix - :: Vector v e - => Word -- ^ Coefficient value, representing the /minimum/ transition cost from a state to gap - -> Word -- ^ Number of abiguous elements in both inputs which contained gap as a possible state - -> e -- ^ Gap State - -> TCMλ e -- ^ Metric between states producing the medoid of states. - -> v e -- ^ Shorter dynamic character - -> v e -- ^ Longer dynamic character - -> (Word, Matrix Direction) + ∷ (Vector v e) + ⇒ Word + -- ^ Coefficient value, representing the /minimum/ transition cost from a state to gap + → Word + -- ^ Number of abiguous elements in both inputs which contained gap as a possible state + → e + -- ^ Gap State + → TCMλ e + -- ^ Metric between states producing the medoid of states. + → v e + -- ^ Shorter dynamic character + → v e + -- ^ Longer dynamic character + → (Word, Matrix Direction) createUkkonenMethodMatrix minimumIndelCost inputGapAmbiguities gap tcmλ lesserLeft longerTop = finalMatrix - where - -- General values that need to be in scope for the recursive computations. - lesserLen = GV.length lesserLeft - longerLen = GV.length longerTop - - -- We start the offset at four rather than at one so that the first doubling - -- isn't trivially small. - startOffset = 2 - - -- /O(1)/ - -- - -- Necessary to compute the width of a row in the barrier-constrained matrix. - quasiDiagonalWidth = toEnum $ differenceInLength + 1 - where - differenceInLength = longerLen - lesserLen - - extra = (inputGapAmbiguities +) - - finalMatrix = runST $ do - (mCost, mDir) <- buildInitialBandedMatrix gap tcmλ lesserLeft longerTop $ extra startOffset - let getAlignmentCost = M.unsafeRead mCost (lesserLen, longerLen) - offsetRef <- newSTRef startOffset - - let needToResizeBand = do - offset <- readSTRef offsetRef - -- If the filled row width exceeds the actual row length, - -- Then clearly we are done as we have filled the entire matrix. - if quasiDiagonalWidth + extra offset > toEnum longerLen - then pure False - else let partialWidth = quasiDiagonalWidth + offset - -- Value that the alignment cost must be less than - threshold -- The threshold value must be non-negative - | partialWidth <= inputGapAmbiguities = 0 - | otherwise = minimumIndelCost * (partialWidth - inputGapAmbiguities) - in (threshold <=) <$> getAlignmentCost - - whileM_ needToResizeBand $ do - previousOffset <- readSTRef offsetRef - let currentOffset = previousOffset `shiftL` 1 -- Multiply by 2 - writeSTRef offsetRef currentOffset - expandBandedMatrix gap tcmλ lesserLeft longerTop mCost mDir - (extra previousOffset) - (extra currentOffset) - - c <- getAlignmentCost - m <- unsafeFreeze mDir - pure (c, m) + where + -- General values that need to be in scope for the recursive computations. + lesserLen = GV.length lesserLeft + longerLen = GV.length longerTop + + -- We start the offset at four rather than at one so that the first doubling + -- isn't trivially small. + startOffset = 2 + + -- /O(1)/ + -- + -- Necessary to compute the width of a row in the barrier-constrained matrix. + quasiDiagonalWidth = toEnum $ differenceInLength + 1 + where + differenceInLength = longerLen - lesserLen + + extra = (inputGapAmbiguities +) + + finalMatrix = runST $ do + (mCost, mDir) ← buildInitialBandedMatrix gap tcmλ lesserLeft longerTop $ extra startOffset + let getAlignmentCost = M.unsafeRead mCost (lesserLen, longerLen) + offsetRef ← newSTRef startOffset + + let needToResizeBand = do + offset ← readSTRef offsetRef + -- If the filled row width exceeds the actual row length, + -- Then clearly we are done as we have filled the entire matrix. + if quasiDiagonalWidth + extra offset > toEnum longerLen + then pure False + else + let partialWidth = quasiDiagonalWidth + offset + -- Value that the alignment cost must be less than + threshold -- The threshold value must be non-negative + | partialWidth <= inputGapAmbiguities = 0 + | otherwise = minimumIndelCost * (partialWidth - inputGapAmbiguities) + in (threshold <=) <$> getAlignmentCost + + whileM_ needToResizeBand $ do + previousOffset ← readSTRef offsetRef + let currentOffset = previousOffset `shiftL` 1 -- Multiply by 2 + writeSTRef offsetRef currentOffset + expandBandedMatrix + gap + tcmλ + lesserLeft + longerTop + mCost + mDir + (extra previousOffset) + (extra currentOffset) + + c ← getAlignmentCost + m ← unsafeFreeze mDir + pure (c, m) {-# SCC buildInitialBandedMatrix #-} buildInitialBandedMatrix - :: Vector v e - => e -- ^ Gap - -> TCMλ e -- ^ Metric between states producing the medoid of states. - -> v e -- ^ Shorter dynamic character - -> v e -- ^ Longer dynamic character - -> Word - -> ST s (MMatrix s Word, MMatrix s Direction) + ∷ (Vector v e) + ⇒ e + -- ^ Gap + → TCMλ e + -- ^ Metric between states producing the medoid of states. + → v e + -- ^ Shorter dynamic character + → v e + -- ^ Longer dynamic character + → Word + → ST s (MMatrix s Word, MMatrix s Direction) buildInitialBandedMatrix gap tcmλ lesserLeft longerTop o = fullMatrix - where - (offset, costλ, rows, cols, width, quasiDiagonalWidth) = ukkonenConstants tcmλ lesserLeft longerTop o - - fullMatrix = do - - --------------------------------------- - -- Allocate required space -- - --------------------------------------- - - mCost <- M.new (rows, cols) - mDir <- M.new (rows, cols) - - --------------------------------------- - -- Define some generalized functions -- - --------------------------------------- - let ~(readCost, write, internalCell, leftColumn, leftBoundary, rightBoundary, rightColumn) = - edgeCellDefinitions gap costλ longerTop mCost mDir - - -- Define how to compute values to an entire row of the Ukkonen matrix. - let writeRow i = - -- Precompute some values that will be used for the whole row - let start = max 0 $ i - offset - stop = min (cols - 1) $ i - offset + width - 1 - leftElement = lesserLeft ! (i - 1) - insertCost = costλ leftElement gap - - -- Each row in the matrix with values in the band has 'width' cells. - -- However, the band runs off the left end of the matrix for the first - -- several rows of the matrix. How many rows, though? - -- There are exactly 'offset' number of cells left of first matrix column - -- in the first row. The number of cells to the left of the matrix - -- decreases by one in each subsequent row. The means that the first - -- row, and then the next 'offset' number of rows require special handling - -- of the boundary. The last row index requiring special handling is index - -- 'offset'. Subsequent rows have the band begin at least one cell away - -- from the matrix boundary. - firstCell - | i <= offset = leftColumn - | otherwise = leftBoundary - - lastCell - | i <= cols - quasiDiagonalWidth - offset = rightBoundary - | otherwise = rightColumn - in do -- Write to the first cell of the Ukkonen band - firstCell leftElement insertCost i start - -- Write to the all the intermediary cells of the Ukkonen band - for_ [start + 1 .. stop - 1] $ \j -> - internalCell leftElement insertCost i j - -- Write to the last cell of the Ukkonen band - lastCell leftElement insertCost i stop - - --------------------------------------- - -- Compute all values of the matrix -- - --------------------------------------- - - -- Write to the origin to seed the first row. - write (0, 0) (# 0, DiagArrow #) - - -- Each row in the matrix with values in the band has 'width' cells. - -- However, the band runs off the end of the matrix for the first & last - -- rows of the matrix. We subtract the 'offest' from the 'width' because - -- there are exactly 'offset' number of cells left of first matrix column. - -- Hence the top row's width is 'width' minus 'offset'. The last cell index - -- in the top row of the band is 'width' minus 'offset' minus 1. - let topRowWidth = width - offset - let topRowWrite !j !cost = write (0,j) (# cost, LeftArrow #) - - -- Write the first row to seed subsequent rows. - for_ [1 .. min (cols - 1) (topRowWidth - 1)] $ \j -> - let topElement = longerTop ! (j - 1) - firstCellCost = costλ gap topElement - in do firstPrevCost <- readCost 0 $ j - 1 - topRowWrite j $ firstCellCost + firstPrevCost - - -- Loop through the remaining rows. - for_ [1 .. rows - 1] writeRow - - -- Return the matricies for possible expansion - pure (mCost, mDir) + where + (offset, costλ, rows, cols, width, quasiDiagonalWidth) = ukkonenConstants tcmλ lesserLeft longerTop o + + fullMatrix = do + --------------------------------------- + -- Allocate required space -- + --------------------------------------- + + mCost ← M.new (rows, cols) + mDir ← M.new (rows, cols) + + --------------------------------------- + -- Define some generalized functions -- + --------------------------------------- + let ~(readCost, write, internalCell, leftColumn, leftBoundary, rightBoundary, rightColumn) = + edgeCellDefinitions gap costλ longerTop mCost mDir + + -- Define how to compute values to an entire row of the Ukkonen matrix. + let writeRow i = + -- Precompute some values that will be used for the whole row + let start = max 0 $ i - offset + stop = min (cols - 1) $ i - offset + width - 1 + leftElement = lesserLeft ! (i - 1) + insertCost = costλ leftElement gap + + -- Each row in the matrix with values in the band has 'width' cells. + -- However, the band runs off the left end of the matrix for the first + -- several rows of the matrix. How many rows, though? + -- There are exactly 'offset' number of cells left of first matrix column + -- in the first row. The number of cells to the left of the matrix + -- decreases by one in each subsequent row. The means that the first + -- row, and then the next 'offset' number of rows require special handling + -- of the boundary. The last row index requiring special handling is index + -- 'offset'. Subsequent rows have the band begin at least one cell away + -- from the matrix boundary. + firstCell + | i <= offset = leftColumn + | otherwise = leftBoundary + + lastCell + | i <= cols - quasiDiagonalWidth - offset = rightBoundary + | otherwise = rightColumn + in do + -- Write to the first cell of the Ukkonen band + firstCell leftElement insertCost i start + -- Write to the all the intermediary cells of the Ukkonen band + for_ [start + 1 .. stop - 1] $ \j → + internalCell leftElement insertCost i j + -- Write to the last cell of the Ukkonen band + lastCell leftElement insertCost i stop + + --------------------------------------- + -- Compute all values of the matrix -- + --------------------------------------- + + -- Write to the origin to seed the first row. + write (0, 0) (# 0, DiagArrow #) + + -- Each row in the matrix with values in the band has 'width' cells. + -- However, the band runs off the end of the matrix for the first & last + -- rows of the matrix. We subtract the 'offest' from the 'width' because + -- there are exactly 'offset' number of cells left of first matrix column. + -- Hence the top row's width is 'width' minus 'offset'. The last cell index + -- in the top row of the band is 'width' minus 'offset' minus 1. + let topRowWidth = width - offset + let topRowWrite !j !cost = write (0, j) (# cost, LeftArrow #) + + -- Write the first row to seed subsequent rows. + for_ [1 .. min (cols - 1) (topRowWidth - 1)] $ \j → + let topElement = longerTop ! (j - 1) + firstCellCost = costλ gap topElement + in do + firstPrevCost ← readCost 0 $ j - 1 + topRowWrite j $ firstCellCost + firstPrevCost + + -- Loop through the remaining rows. + for_ [1 .. rows - 1] writeRow + + -- Return the matricies for possible expansion + pure (mCost, mDir) {-# SCC expandBandedMatrix #-} --- | --- Given a partially computed alignment matrix, --- will expand the computed region to the new specified offset. --- --- --- Dimensions: 13 ⨉ 17 --- ⊗ ┃ ⁎ α1 α2 α3 α4 α5 α6 α7 α8 α9 α0 α1 α2 α3 α4 α5 α6 --- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ --- ⁎ ┃ 0↖ 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← --- α1 ┃ 0↑ 0↖ 0↖ 0↖ 0← 0← 0← 0← 0← 0← 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0← --- α2 ┃ 0↑ 0↖ 0↖ 0↖ 0← 0← 0← 0← 0← 0← 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0← --- α3 ┃ 0↑ 0↑ 0↑ 0↖ 0↖ 0↖ 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0↖ --- α4 ┃ 0↑ 0↑ 0↑ 0↖ 0↖ 0← 0↖ 0← 0↖ 0← 0← 0← 0← 0← 0← 0← 0← --- α5 ┃ 0↑ 0↑ 0↑ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0← 0← 0← 0← 0← 0← 0← 0← --- α6 ┃ 0↑ 0↖ 0↖ 0↖ 0↑ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0← --- α7 ┃ 0↑ 0↑ 0↑ 0↑ 0↖ 0↖ 0↖ 0↖ 0↖ 0← 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ --- α8 ┃ 0↑ 0↑ 0↑ 0↑ 0↑ 0↖ 0← 0↖ 0↑ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ --- α9 ┃ 0↑ 0↑ 0↑ 0↑ 0↖ 0↑ 0↖ 0← 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ --- α0 ┃ 0↑ 0↑ 0↑ 0↑ 0↑ 0↖ 0↑ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ --- α1 ┃ 0↑ 0↑ 0↑ 0↑ 0↖ 0↑ 0↖ 0↖ 0↖ 0← 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ --- α2 ┃ 0↑ 0↖ 0↖ 0↖ 0↑ 0↑ 0↑ 0↖ 0↑ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ --- --- ┌───────────────w───────────────┐ --- │ ┏━━━━━━━co━━━━━━━┪ --- ┢━━━━━qd━━━━━━┓┠─po─┐┌────Δo────┨ --- ⊗ ┃ ┃0 1 2 3 4┃┃5 6││7 8 9 10┃11 12 13 14 15 16 --- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ --- 0 ┃ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ --- 1 ┃ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ --- 2 ┃ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ --- 3 ┃ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ --- 4 ┃ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ --- 5 ┃ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ --- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ --- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ --- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ --- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ --- 0 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ --- 1 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ --- 2 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ --- --- --- w : Width --- qd : Quasi-diagonal --- co : Current Offset --- po : Previous Offset --- Δo : Difference in Offset --- --- Note: --- w = qd + co --- co = po + Δo --- --- And often: --- co = 2*po = 2*Δo --- --- ██ : The core band --- * Previously computed, sections may need to be recomputed --- --- ▓▓ : The previous extension --- * Previously computed, sections may need to be recomputed --- --- ▒▒ : The new extension --- * Needs to be computed --- -expandBandedMatrix - :: Vector v e - => e -- ^ Gap state - -> TCMλ e -- ^ Metric between states producing the medoid of states. - -> v e -- ^ Shorter dynamic character - -> v e -- ^ Longer dynamic character - -> MMatrix s Word - -> MMatrix s Direction - -> Word - -> Word - -> ST s () -expandBandedMatrix gap tcmλ lesserLeft longerTop mCost mDir po co = updatedBand - where - (offset, costλ, rows, cols, width, qd) = ukkonenConstants tcmλ lesserLeft longerTop co - prevOffset = fromEnum po - - updatedBand = do - - --------------------------------------- - -- Allocate mutable state variables -- - --------------------------------------- - - tailStart <- newSTRef cols - t0' <- newSTRef (-1) - t1' <- newSTRef $ qd + fromEnum po - --------------------------------------- - -- Define some generalized functions -- - --------------------------------------- - let ~(readCost, write, internalCell, leftColumn, leftBoundary, rightBoundary, rightColumn) = - edgeCellDefinitions gap costλ longerTop mCost mDir - - let computeCell !leftElement !insertCost !i !j = {-# SCC recomputeCell #-} - let !topElement = longerTop ! (j - 1) - !deleteCost = costλ gap topElement - !alignCost = costλ leftElement topElement - in do - diagCost <- readCost (i - 1) $ j - 1 - topCost <- readCost (i - 1) j - leftCost <- readCost i $ j - 1 - oldCost <- readCost i j - let !e@(# c, _ #) = minimumCostDirection +{- | +Given a partially computed alignment matrix, +will expand the computed region to the new specified offset. + + +Dimensions: 13 ⨉ 17 + ⊗ ┃ ⁎ α1 α2 α3 α4 α5 α6 α7 α8 α9 α0 α1 α2 α3 α4 α5 α6 +━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + ⁎ ┃ 0↖ 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← +α1 ┃ 0↑ 0↖ 0↖ 0↖ 0← 0← 0← 0← 0← 0← 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0← +α2 ┃ 0↑ 0↖ 0↖ 0↖ 0← 0← 0← 0← 0← 0← 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0← +α3 ┃ 0↑ 0↑ 0↑ 0↖ 0↖ 0↖ 0← 0← 0← 0← 0← 0← 0← 0← 0← 0← 0↖ +α4 ┃ 0↑ 0↑ 0↑ 0↖ 0↖ 0← 0↖ 0← 0↖ 0← 0← 0← 0← 0← 0← 0← 0← +α5 ┃ 0↑ 0↑ 0↑ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0← 0← 0← 0← 0← 0← 0← 0← +α6 ┃ 0↑ 0↖ 0↖ 0↖ 0↑ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0← +α7 ┃ 0↑ 0↑ 0↑ 0↑ 0↖ 0↖ 0↖ 0↖ 0↖ 0← 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ +α8 ┃ 0↑ 0↑ 0↑ 0↑ 0↑ 0↖ 0← 0↖ 0↑ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ +α9 ┃ 0↑ 0↑ 0↑ 0↑ 0↖ 0↑ 0↖ 0← 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ +α0 ┃ 0↑ 0↑ 0↑ 0↑ 0↑ 0↖ 0↑ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ +α1 ┃ 0↑ 0↑ 0↑ 0↑ 0↖ 0↑ 0↖ 0↖ 0↖ 0← 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ +α2 ┃ 0↑ 0↖ 0↖ 0↖ 0↑ 0↑ 0↑ 0↖ 0↑ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ 0↖ + + ┌───────────────w───────────────┐ + │ ┏━━━━━━━co━━━━━━━┪ + ┢━━━━━qd━━━━━━┓┠─po─┐┌────Δo────┨ + ⊗ ┃ ┃0 1 2 3 4┃┃5 6││7 8 9 10┃11 12 13 14 15 16 +━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + 0 ┃ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ + 1 ┃ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ + 2 ┃ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ + 3 ┃ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ + 4 ┃ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ + 5 ┃ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ + 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ ▒▒ + 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ ▒▒ + 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ ▒▒ + 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ ▒▒ + 0 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ ▓▓ + 1 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ ▓▓ + 2 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ▓▓ ▓▓ ██ ██ ██ ██ ██ + + +w : Width +qd : Quasi-diagonal +co : Current Offset +po : Previous Offset +Δo : Difference in Offset + +Note: +w = qd + co +co = po + Δo + +And often: +co = 2*po = 2*Δo + +██ : The core band + * Previously computed, sections may need to be recomputed + +▓▓ : The previous extension + * Previously computed, sections may need to be recomputed + +▒▒ : The new extension + * Needs to be computed +-} +expandBandedMatrix + ∷ (Vector v e) + ⇒ e + -- ^ Gap state + → TCMλ e + -- ^ Metric between states producing the medoid of states. + → v e + -- ^ Shorter dynamic character + → v e + -- ^ Longer dynamic character + → MMatrix s Word + → MMatrix s Direction + → Word + → Word + → ST s () +expandBandedMatrix gap tcmλ lesserLeft longerTop mCost mDir po co = updatedBand + where + (offset, costλ, rows, cols, width, qd) = ukkonenConstants tcmλ lesserLeft longerTop co + prevOffset = fromEnum po + + updatedBand = do + --------------------------------------- + -- Allocate mutable state variables -- + --------------------------------------- + + tailStart ← newSTRef cols + + t0' ← newSTRef (-1) + t1' ← newSTRef $ qd + fromEnum po + + --------------------------------------- + -- Define some generalized functions -- + --------------------------------------- + let ~(readCost, write, internalCell, leftColumn, leftBoundary, rightBoundary, rightColumn) = + edgeCellDefinitions gap costλ longerTop mCost mDir + + let computeCell !leftElement !insertCost !i !j = + {-# SCC recomputeCell #-} + let !topElement = longerTop ! (j - 1) + !deleteCost = costλ gap topElement + !alignCost = costλ leftElement topElement + in do + diagCost ← readCost (i - 1) $ j - 1 + topCost ← readCost (i - 1) j + leftCost ← readCost i $ j - 1 + oldCost ← readCost i j + let !e@(# c, _ #) = + minimumCostDirection (deleteCost + leftCost) - ( alignCost + diagCost) - (insertCost + topCost) - write (i,j) e - pure (c == oldCost, j+1) --- pure (c /= oldCost, j+1) - - let recomputeRange leftElement insertCost i x y = do - lastDiff <- newSTRef 0 - for_ [x .. y] $ \j -> do - (same, _) <- computeCell leftElement insertCost i j - unless same $ writeSTRef lastDiff j - readSTRef lastDiff - - -- Define how to compute values to an entire row of the Ukkonen matrix. - let extendRow i = - -- Precopmute some values that will be used for the whole row - let start0 = max 0 $ i - offset - start3 = min cols $ i + width - offset - prevOffset - 1 - goUpTo = max 0 ( i - prevOffset) - 1 - stop = min (cols - 1) $ i + width - offset - 1 - leftElement = lesserLeft ! (i - 1) - insertCost = costλ leftElement gap - firstCell - | i <= offset = leftColumn - | otherwise = leftBoundary - - lastCell - | i <= cols - qd - offset = rightBoundary - | otherwise = rightColumn - - b0 = start0 - e0 = goUpTo - b1 = start3 - e1 = stop - - continueRecomputing (same, j) = same || j >= stop - computeCell' ~(_,j) = computeCell leftElement insertCost i j - internalCell' j = internalCell leftElement insertCost i j - recomputeUntilSame j = snd <$> iterateUntilM continueRecomputing computeCell' (False, j) - in do -- First, we fill in 0 or more cells of the left region of - -- the expanded band. This is the region [b0, e0] computed - -- above. - -- ⊗ ┃ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 - -- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -- 0 ┃ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 1 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 2 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 3 ┃ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 4 ┃ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- b0 e0 - -- ┏━━━━━━━━┓ - -- 5 ┃ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- - -- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ - -- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ - -- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ - -- 10 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ - -- 11 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ - -- 12 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ - -- - - -- Conditionally write to the first cell of the Ukkonen band - if i > prevOffset - then firstCell leftElement insertCost i b0 - else pure () - - for_ [b0+1 .. e0] internalCell' - - -- Next, we assign to s0 the value t0 from the previous row. - -- The cell t0 is up to where the values were recomputed in - -- the previous row. - -- We recompute the cells in the range [e0 + 1, s0]. - -- We assign to t0 the last cell in the range [s1, s2] which - -- was updated for the next row. - -- We remember cells t0 for the next row. - -- ⊗ ┃ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 - -- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -- 0 ┃ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 1 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 2 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 3 ┃ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 4 ┃ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- e0 s0 - -- ┏━━━━━┓ - -- 5 ┃ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- - -- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ - -- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ - -- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ - -- 10 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ - -- 11 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ - -- 12 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ - -- - -- - s0 <- (\x -> min (x+1) e1) <$> readSTRef t0' - writeSTRef t0' (-1) - - when (s0 > e0 && toEnum i > po) $ - recomputeRange leftElement insertCost i (e0+1) s0 >>= writeSTRef t0' - t0 <- readSTRef t0' - - -- If s0 = t0, we recompute the cell (s0 + 1). - -- If the cost is the same, we stop here and remember the cell - -- before we stopped. - -- If the cost is not the same, we update cell (s0 + 1) and - -- move on to (s0 + 2). - -- This procedure continues until (s0 + n) has the same cost - -- as before, or *until we reach b1.* - -- We remember the cell (s0 + n - 1) as t0 for the next row. - -- ⊗ ┃ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 - -- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -- 0 ┃ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 1 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 2 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 3 ┃ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 4 ┃ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- s0 t0 - -- ╔═════╗ - -- 5 ┃ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- - -- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ - -- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ - -- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ - -- 10 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ - -- 11 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ - -- 12 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ - -- - if s0 == t0 && s0 > 0 - then recomputeUntilSame (s0 + 1) >>= writeSTRef t0' . pred - else if s0 <= e0 && e0 > 0 - then recomputeUntilSame (e0 + 1) >>= writeSTRef t0' . pred - else pure () - - -- Next, we assign to s1 the value t1 from the previous row. - -- We also assign s2 the value t2 from the previous row. - -- The range [t1, t2] is where the values were recomputed in - -- the previous row. - -- We recompute the cells in the range [s1, s2]. - -- We assign to t2 the last cell in the range [s1, s2] which - -- was updated for the next row. - -- We remember cells s1 as t1 for the next row. - -- ⊗ ┃ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 - -- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -- 0 ┃ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 1 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 2 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 3 ┃ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 4 ┃ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- s1 s2 - -- ┏━━━━━┓ - -- 5 ┃ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- - -- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ - -- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ - -- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ - -- 10 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ - -- 11 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ - -- 12 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ - -- - -- NOPE, Try again - -- - -- Next, we assign to s1 the value t1 from the previous row. - -- If s1 is less than t0, we assign to s1 the value t0 + 1. - -- This ensures that we do not start "behind" where we have - -- previously computed. - -- Then if s1 is greater than e1, we assign to s1 the - -- value e1. This ensures one cell is always written to. - -- We recompute the cells in the range [s1, b1 - 1]. - -- If any cell in the range was updated, we assign to s1 to t1. - -- We remember cell t1 for the next row. - -- ⊗ ┃ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 - -- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -- 0 ┃ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 1 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 2 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 3 ┃ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 4 ┃ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- s1 b1 - -- ┏━━━━━━━━┓ - -- 5 ┃ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- - -- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ - -- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ - -- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ - -- 10 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ - -- 11 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ - -- 12 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ - -- - s1 <- do a <- readSTRef t0' - b <- readSTRef t1' - pure . min e1 $ max a b - - t1 <- recomputeRange leftElement insertCost i s1 $ b1 - 1 - - -- If no cells were updated, a zero value is returned. - -- In this case, the "last" updated cell for the next row is b1. - writeSTRef t1' $ if t1 == 0 then b1 else s1 - - -- Lastly, we fill in 0 or more cells of the left region of - -- the expanded band. This is the region [b1, e1] computed - -- above. - -- ⊗ ┃ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 - -- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ - -- 0 ┃ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 1 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 2 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 3 ┃ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 4 ┃ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- b1 e1 - -- ┏━━━━━━━━━━━┓ - -- 5 ┃ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- - -- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ - -- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ - -- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ - -- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ - -- 10 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ - -- 11 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ - -- 12 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ - -- - for_ [b1 .. e1 - 1] internalCell' - - -- Conditionally write to the last cell of the Ukkonen band - if i < rows - fromEnum po - then lastCell leftElement insertCost i stop - else pure () - - --------------------------------------- - -- Compute all values of the matrix -- - --------------------------------------- - - -- We start computation in the top row at the index equal to the - -- quasi-diagonal width plus the previous offset. This is because the last - -- cell of the top row which we computed was at the previous index since - -- we previous computed an number of cells from the quasi-diagonal equal to - -- the previous offset. - let topRowStart = qd + prevOffset - - -- Each row in the matrix with values in the band has 'width' cells. - -- However, the band runs off the end of the matrix for the first & last - -- rows of the matrix. We subtract the 'offset' from the 'width' because - -- there are exactly 'offset' number of cells left of first matrix column. - -- Hence the top row's width is 'width' minus 'offset'. The last cell index - -- in the top row of the band is 'width' minus 'offset' minus 1. - let topRowWidth = width - offset - - -- Of course, we must be cetrain that we don't extend past the last column - -- of the matrix. To prevent this, we take the minimum of the top row width - -- and the number of columns. So the last index we will compute in the top - -- row is the minimum of the two options minus one due to zero-indexing. - let topRowCease = pred $ min cols topRowWidth - - -- Write out the left arrow value on the top row. - let topRowWrite !j !cost = write (0,j) (# cost, LeftArrow #) - - -- Extend the first row to seed subsequent rows. - for_ [ topRowStart .. topRowCease ] $ \j -> - let !topElement = longerTop ! (j - 1) - !firstCellCost = costλ gap topElement - in do !firstPrevCost <- readCost 0 $ j - 1 - topRowWrite j $ firstCellCost + firstPrevCost - - writeSTRef tailStart topRowStart - - -- Loop through the remaining rows. - for_ [1 .. rows - 1] extendRow + (alignCost + diagCost) + (insertCost + topCost) + write (i, j) e + pure (c == oldCost, j + 1) + -- pure (c /= oldCost, j+1) + + let recomputeRange leftElement insertCost i x y = do + lastDiff ← newSTRef 0 + for_ [x .. y] $ \j → do + (same, _) ← computeCell leftElement insertCost i j + unless same $ writeSTRef lastDiff j + readSTRef lastDiff + + -- Define how to compute values to an entire row of the Ukkonen matrix. + let extendRow i = + -- Precopmute some values that will be used for the whole row + let start0 = max 0 $ i - offset + start3 = min cols $ i + width - offset - prevOffset - 1 + goUpTo = max 0 (i - prevOffset) - 1 + stop = min (cols - 1) $ i + width - offset - 1 + leftElement = lesserLeft ! (i - 1) + insertCost = costλ leftElement gap + firstCell + | i <= offset = leftColumn + | otherwise = leftBoundary + + lastCell + | i <= cols - qd - offset = rightBoundary + | otherwise = rightColumn + + b0 = start0 + e0 = goUpTo + b1 = start3 + e1 = stop + + continueRecomputing (same, j) = same || j >= stop + computeCell' ~(_, j) = computeCell leftElement insertCost i j + internalCell' j = internalCell leftElement insertCost i j + recomputeUntilSame j = snd <$> iterateUntilM continueRecomputing computeCell' (False, j) + in do + -- First, we fill in 0 or more cells of the left region of + -- the expanded band. This is the region [b0, e0] computed + -- above. + -- ⊗ ┃ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 + -- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + -- 0 ┃ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 1 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 2 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 3 ┃ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 4 ┃ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- b0 e0 + -- ┏━━━━━━━━┓ + -- 5 ┃ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- + -- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ + -- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ + -- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ + -- 10 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ + -- 11 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ + -- 12 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ + -- + + -- Conditionally write to the first cell of the Ukkonen band + if i > prevOffset + then firstCell leftElement insertCost i b0 + else pure () + + for_ [b0 + 1 .. e0] internalCell' + + -- Next, we assign to s0 the value t0 from the previous row. + -- The cell t0 is up to where the values were recomputed in + -- the previous row. + -- We recompute the cells in the range [e0 + 1, s0]. + -- We assign to t0 the last cell in the range [s1, s2] which + -- was updated for the next row. + -- We remember cells t0 for the next row. + -- ⊗ ┃ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 + -- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + -- 0 ┃ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 1 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 2 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 3 ┃ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 4 ┃ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- e0 s0 + -- ┏━━━━━┓ + -- 5 ┃ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- + -- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ + -- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ + -- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ + -- 10 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ + -- 11 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ + -- 12 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ + -- + -- + s0 ← (\x → min (x + 1) e1) <$> readSTRef t0' + writeSTRef t0' (-1) + + when (s0 > e0 && toEnum i > po) $ + recomputeRange leftElement insertCost i (e0 + 1) s0 >>= writeSTRef t0' + t0 ← readSTRef t0' + + -- If s0 = t0, we recompute the cell (s0 + 1). + -- If the cost is the same, we stop here and remember the cell + -- before we stopped. + -- If the cost is not the same, we update cell (s0 + 1) and + -- move on to (s0 + 2). + -- This procedure continues until (s0 + n) has the same cost + -- as before, or *until we reach b1.* + -- We remember the cell (s0 + n - 1) as t0 for the next row. + -- ⊗ ┃ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 + -- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + -- 0 ┃ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 1 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 2 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 3 ┃ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 4 ┃ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- s0 t0 + -- ╔═════╗ + -- 5 ┃ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- + -- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ + -- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ + -- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ + -- 10 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ + -- 11 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ + -- 12 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ + -- + if s0 == t0 && s0 > 0 + then recomputeUntilSame (s0 + 1) >>= writeSTRef t0' . pred + else + if s0 <= e0 && e0 > 0 + then recomputeUntilSame (e0 + 1) >>= writeSTRef t0' . pred + else pure () + + -- Next, we assign to s1 the value t1 from the previous row. + -- We also assign s2 the value t2 from the previous row. + -- The range [t1, t2] is where the values were recomputed in + -- the previous row. + -- We recompute the cells in the range [s1, s2]. + -- We assign to t2 the last cell in the range [s1, s2] which + -- was updated for the next row. + -- We remember cells s1 as t1 for the next row. + -- ⊗ ┃ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 + -- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + -- 0 ┃ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 1 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 2 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 3 ┃ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 4 ┃ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- s1 s2 + -- ┏━━━━━┓ + -- 5 ┃ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- + -- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ + -- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ + -- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ + -- 10 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ + -- 11 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ + -- 12 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ + -- + -- NOPE, Try again + -- + -- Next, we assign to s1 the value t1 from the previous row. + -- If s1 is less than t0, we assign to s1 the value t0 + 1. + -- This ensures that we do not start "behind" where we have + -- previously computed. + -- Then if s1 is greater than e1, we assign to s1 the + -- value e1. This ensures one cell is always written to. + -- We recompute the cells in the range [s1, b1 - 1]. + -- If any cell in the range was updated, we assign to s1 to t1. + -- We remember cell t1 for the next row. + -- ⊗ ┃ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 + -- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + -- 0 ┃ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 1 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 2 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 3 ┃ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 4 ┃ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- s1 b1 + -- ┏━━━━━━━━┓ + -- 5 ┃ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- + -- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ + -- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ + -- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ + -- 10 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ + -- 11 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ + -- 12 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ + -- + s1 ← do + a ← readSTRef t0' + b ← readSTRef t1' + pure . min e1 $ max a b + + t1 ← recomputeRange leftElement insertCost i s1 $ b1 - 1 + + -- If no cells were updated, a zero value is returned. + -- In this case, the "last" updated cell for the next row is b1. + writeSTRef t1' $ if t1 == 0 then b1 else s1 + + -- Lastly, we fill in 0 or more cells of the left region of + -- the expanded band. This is the region [b1, e1] computed + -- above. + -- ⊗ ┃ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 + -- ━━━╋━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ + -- 0 ┃ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 1 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 2 ┃ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 3 ┃ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 4 ┃ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- b1 e1 + -- ┏━━━━━━━━━━━┓ + -- 5 ┃ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- + -- 6 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ ▒▒ + -- 7 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ ▒▒ + -- 8 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ ▒▒ + -- 9 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ ▒▒ + -- 10 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ ██ + -- 11 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ ██ + -- 12 ┃ ▒▒ ▒▒ ▒▒ ▒▒ ██ ██ ██ ██ ██ ██ ██ + -- + for_ [b1 .. e1 - 1] internalCell' + + -- Conditionally write to the last cell of the Ukkonen band + if i < rows - fromEnum po + then lastCell leftElement insertCost i stop + else pure () + + --------------------------------------- + -- Compute all values of the matrix -- + --------------------------------------- + + -- We start computation in the top row at the index equal to the + -- quasi-diagonal width plus the previous offset. This is because the last + -- cell of the top row which we computed was at the previous index since + -- we previous computed an number of cells from the quasi-diagonal equal to + -- the previous offset. + let topRowStart = qd + prevOffset + + -- Each row in the matrix with values in the band has 'width' cells. + -- However, the band runs off the end of the matrix for the first & last + -- rows of the matrix. We subtract the 'offset' from the 'width' because + -- there are exactly 'offset' number of cells left of first matrix column. + -- Hence the top row's width is 'width' minus 'offset'. The last cell index + -- in the top row of the band is 'width' minus 'offset' minus 1. + let topRowWidth = width - offset + + -- Of course, we must be cetrain that we don't extend past the last column + -- of the matrix. To prevent this, we take the minimum of the top row width + -- and the number of columns. So the last index we will compute in the top + -- row is the minimum of the two options minus one due to zero-indexing. + let topRowCease = pred $ min cols topRowWidth + + -- Write out the left arrow value on the top row. + let topRowWrite !j !cost = write (0, j) (# cost, LeftArrow #) + + -- Extend the first row to seed subsequent rows. + for_ [topRowStart .. topRowCease] $ \j → + let !topElement = longerTop ! (j - 1) + !firstCellCost = costλ gap topElement + in do + !firstPrevCost ← readCost 0 $ j - 1 + topRowWrite j $ firstCellCost + firstPrevCost + + writeSTRef tailStart topRowStart + + -- Loop through the remaining rows. + for_ [1 .. rows - 1] extendRow edgeCellDefinitions - :: ( PrimMonad m - , Vector v e - ) - => e -- ^ Gap state - -> (e -> e -> Word) -- ^ Distance between states - -> v e -- ^ Longer dynamic character - -> MMatrix (PrimState m) Word - -> MMatrix (PrimState m) Direction - -> ( Int -> Int -> m Word - , (Int, Int) -> (# Word, Direction #) -> m () - , e -> Word -> Int -> Int -> m () - , e -> Word -> Int -> Int -> m () - , e -> Word -> Int -> Int -> m () - , e -> Word -> Int -> Int -> m () - , e -> Word -> Int -> Int -> m () - ) + ∷ ( PrimMonad m + , Vector v e + ) + ⇒ e + -- ^ Gap state + → (e → e → Word) + -- ^ Distance between states + → v e + -- ^ Longer dynamic character + → MMatrix (PrimState m) Word + → MMatrix (PrimState m) Direction + → ( Int → Int → m Word + , (Int, Int) → (# Word, Direction #) → m () + , e → Word → Int → Int → m () + , e → Word → Int → Int → m () + , e → Word → Int → Int → m () + , e → Word → Int → Int → m () + , e → Word → Int → Int → m () + ) edgeCellDefinitions gap costλ longerTop mCost mDir = (readCost, write, internalCell, leftColumn, leftBoundary, rightBoundary, rightColumn) - where - -- Read the cost of a cell - readCost = curry $ M.unsafeRead mCost - - -- Write to a single cell of the current vector and directional matrix simultaneously - write !p (# !c, !d #) = M.unsafeWrite mCost p c *> M.unsafeWrite mDir p d - - -- Write to an internal cell (not on a boundary) of the matrix. - internalCell !leftElement !insertCost !i !j = {-# SCC internalCell_expanding #-} - let !topElement = longerTop ! (j - 1) - !deleteCost = costλ gap topElement - !alignCost = costλ leftElement topElement - in do diagCost <- readCost (i - 1) $ j - 1 - topCost <- readCost (i - 1) j - leftCost <- readCost i $ j - 1 - let v = minimumCostDirection + where + -- Read the cost of a cell + readCost = curry $ M.unsafeRead mCost + + -- Write to a single cell of the current vector and directional matrix simultaneously + write !p (# !c, !d #) = M.unsafeWrite mCost p c *> M.unsafeWrite mDir p d + + -- Write to an internal cell (not on a boundary) of the matrix. + internalCell !leftElement !insertCost !i !j = + {-# SCC internalCell_expanding #-} + let !topElement = longerTop ! (j - 1) + !deleteCost = costλ gap topElement + !alignCost = costλ leftElement topElement + in do + diagCost ← readCost (i - 1) $ j - 1 + topCost ← readCost (i - 1) j + leftCost ← readCost i $ j - 1 + let v = + minimumCostDirection + (deleteCost + leftCost) + (alignCost + diagCost) + (insertCost + topCost) + write (i, j) v + + -- Define how to compute the first cell of the first "offset" rows. + -- We need to ensure that there are only Up Arrow values in the directional matrix. + -- We can also reduce the number of comparisons the first row makes from 3 to 1, + -- since the diagonal and leftward values are "out of bounds." + leftColumn _leftElement !insertCost !i !j = + {-# SCC leftColumn #-} + do + firstPrevCost ← readCost (i - 1) j + write (i, j) (# insertCost + firstPrevCost, UpArrow #) + + -- Define how to compute the first cell of the remaining rows. + -- We need to ensure that there are no Left Arrow values in the directional matrix. + -- We can also reduce the number of comparisons the first row makes from 3 to 2, + -- since the leftward values are "out of bounds." + -- Define how to compute the first cell of the remaining rows. + -- We need to ensure that there are no Left Arrow values in the directional matrix. + -- We can also reduce the number of comparisons the first row makes from 3 to 2, + -- since the leftward values are "out of bounds." + leftBoundary !leftElement !insertCost !i !j = + {-# SCC leftBoundary #-} + let topElement = longerTop ! (j - 1) + alignCost = costλ leftElement topElement + in do + diagCost ← readCost (i - 1) $ j - 1 + topCost ← readCost (i - 1) j + let v = + minimumCostDirection + maxBound + (alignCost + diagCost) + (insertCost + topCost) + write (i, j) v + + -- Define how to compute the last cell of the first "rows - offset" rows. + -- We need to ensure that there are only Left Arrow values in the directional matrix. + -- We can also reduce the number of comparisons the first row makes from 3 to 1, + -- since the diagonal and upward values are "out of bounds." + rightBoundary !leftElement _insertCost !i !j = + {-# SCC rightBoundary #-} + let topElement = longerTop ! (j - 1) + deleteCost = costλ gap topElement + alignCost = costλ leftElement topElement + in do + diagCost ← readCost (i - 1) $ j - 1 + leftCost ← readCost i $ j - 1 + let v = + minimumCostDirection (deleteCost + leftCost) - ( alignCost + diagCost) - (insertCost + topCost) - write (i,j) v - - - -- Define how to compute the first cell of the first "offset" rows. - -- We need to ensure that there are only Up Arrow values in the directional matrix. - -- We can also reduce the number of comparisons the first row makes from 3 to 1, - -- since the diagonal and leftward values are "out of bounds." - leftColumn _leftElement !insertCost !i !j = {-# SCC leftColumn #-} do - firstPrevCost <- readCost (i - 1) j - write (i,j) (# insertCost + firstPrevCost, UpArrow #) - - -- Define how to compute the first cell of the remaining rows. - -- We need to ensure that there are no Left Arrow values in the directional matrix. - -- We can also reduce the number of comparisons the first row makes from 3 to 2, - -- since the leftward values are "out of bounds." - -- Define how to compute the first cell of the remaining rows. - -- We need to ensure that there are no Left Arrow values in the directional matrix. - -- We can also reduce the number of comparisons the first row makes from 3 to 2, - -- since the leftward values are "out of bounds." - leftBoundary !leftElement !insertCost !i !j = {-# SCC leftBoundary #-} - let topElement = longerTop ! (j - 1) - alignCost = costλ leftElement topElement - in do diagCost <- readCost (i - 1) $ j - 1 - topCost <- readCost (i - 1) j - let v = minimumCostDirection + (alignCost + diagCost) maxBound - ( alignCost + diagCost) - (insertCost + topCost) - write (i,j) v - - -- Define how to compute the last cell of the first "rows - offset" rows. - -- We need to ensure that there are only Left Arrow values in the directional matrix. - -- We can also reduce the number of comparisons the first row makes from 3 to 1, - -- since the diagonal and upward values are "out of bounds." - rightBoundary !leftElement _insertCost !i !j = {-# SCC rightBoundary #-} - let topElement = longerTop ! (j - 1) - deleteCost = costλ gap topElement - alignCost = costλ leftElement topElement - in do diagCost <- readCost (i - 1) $ j - 1 - leftCost <- readCost i $ j - 1 - let v = minimumCostDirection - (deleteCost + leftCost) - ( alignCost + diagCost) - maxBound - write (i,j) v - - rightColumn = {-# SCC rightColumn #-} internalCell - - --- | --- Produces a set of reusable values and functions which are "constant" between --- different incarnations of the Ukkonen algorithms. + write (i, j) v + + rightColumn = {-# SCC rightColumn #-} internalCell + + +{- | +Produces a set of reusable values and functions which are "constant" between +different incarnations of the Ukkonen algorithms. +-} ukkonenConstants - :: Vector v e - => TCMλ e -- ^ Metric between states producing the medoid of states. - -> v e -- ^ Shorter dynamic character - -> v e -- ^ Longer dynamic character - -> Word -- ^ Current offset from quasi-diagonal - -> (Int, e -> e -> Word, Int, Int, Int, Int) + ∷ (Vector v e) + ⇒ TCMλ e + -- ^ Metric between states producing the medoid of states. + → v e + -- ^ Shorter dynamic character + → v e + -- ^ Longer dynamic character + → Word + -- ^ Current offset from quasi-diagonal + → (Int, e → e → Word, Int, Int, Int, Int) ukkonenConstants tcmλ lesserLeft longerTop co = (offset, costλ, rows, cols, width, quasiDiagonalWidth) - where - offset = clampOffset co - costλ x = snd . tcmλ x - longerLen = GV.length longerTop - lesserLen = GV.length lesserLeft - rows = GV.length lesserLeft + 1 - cols = GV.length longerTop + 1 - width = quasiDiagonalWidth + (offset `shiftL` 1) -- Multiply by 2 - quasiDiagonalWidth = differenceInLength + 1 - where - differenceInLength = longerLen - lesserLen - - -- Note: "offset" cannot cause "width + quasiDiagonalWidth" to exceed "2 * cols" - clampOffset o = - let o' = fromEnum o in min o' $ cols - quasiDiagonalWidth + where + offset = clampOffset co + costλ x = snd . tcmλ x + longerLen = GV.length longerTop + lesserLen = GV.length lesserLeft + rows = GV.length lesserLeft + 1 + cols = GV.length longerTop + 1 + width = quasiDiagonalWidth + (offset `shiftL` 1) -- Multiply by 2 + quasiDiagonalWidth = differenceInLength + 1 + where + differenceInLength = longerLen - lesserLen + + -- Note: "offset" cannot cause "width + quasiDiagonalWidth" to exceed "2 * cols" + clampOffset o = + let o' = fromEnum o in min o' $ cols - quasiDiagonalWidth diff --git a/lib/dynamic-character/src/DirectOptimization/Pairwise/Visualization.hs b/lib/dynamic-character/src/DirectOptimization/Pairwise/Visualization.hs index d22dccf97..084b3a77c 100644 --- a/lib/dynamic-character/src/DirectOptimization/Pairwise/Visualization.hs +++ b/lib/dynamic-character/src/DirectOptimization/Pairwise/Visualization.hs @@ -1,61 +1,61 @@ ----------------------------------------------------------------------------- --- | --- Module : DirectOptimization.Pairwise.Visualization --- Copyright : (c) 2015-2021 Ward Wheeler --- License : BSD-style --- --- Maintainer : wheeler@amnh.org --- Stability : provisional --- Portability : portable --- --- Direct optimization pairwise alignment using the Needleman-Wunsch algorithm. --- These functions will allocate an M * N matrix. --- ----------------------------------------------------------------------------- - -{-# LANGUAGE ApplicativeDo #-} -{-# LANGUAGE ConstraintKinds #-} -{-# LANGUAGE DerivingStrategies #-} -{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE ApplicativeDo #-} +{-# LANGUAGE ConstraintKinds #-} +{-# LANGUAGE DerivingStrategies #-} +{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE MultiParamTypeClasses #-} -{-# LANGUAGE Strict #-} -{-# LANGUAGE TypeFamilies #-} -{-# LANGUAGE UnboxedTuples #-} - +{-# LANGUAGE Strict #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE UnboxedTuples #-} {-# OPTIONS_GHC -Wno-incomplete-patterns #-} {-# OPTIONS_GHC -Wno-incomplete-uni-patterns #-} -module DirectOptimization.Pairwise.Visualization - ( Direction() +{- | +Module : DirectOptimization.Pairwise.Visualization +Copyright : (c) 2015-2021 Ward Wheeler +License : BSD-style + +Maintainer : wheeler@amnh.org +Stability : provisional +Portability : portable + +Direct optimization pairwise alignment using the Needleman-Wunsch algorithm. +These functions will allocate an M * N matrix. +-} +module DirectOptimization.Pairwise.Visualization ( + Direction (), + -- * Operational - , directOptimizationDiffDirectionMatricies + directOptimizationDiffDirectionMatricies, + -- * Rendering - , renderMatrix - , renderDirectionMatrix - , diffDirectionMatrix - ) where + renderMatrix, + renderDirectionMatrix, + diffDirectionMatrix, +) where import Bio.DynamicCharacter import Bio.DynamicCharacter.Measure import Data.Bits -import Data.Foldable (fold) -import Data.Matrix.Class (Matrix, dim, toLists, unsafeIndex) -import Data.Set (Set, fromDistinctAscList, member) -import Data.Vector.Generic (Vector, basicLength, toList) +import Data.Foldable (fold) +import Data.Matrix.Class (Matrix, dim, toLists, unsafeIndex) +import Data.Set (Set, fromDistinctAscList, member) +import Data.Vector.Generic (Vector, basicLength, toList) import DirectOptimization.Pairwise.Direction directOptimizationDiffDirectionMatricies - :: ( FiniteBits e - , Matrix m t Direction - , Ord (v e) - , Vector v e - ) - => (v e -> v e -> (Word, m t Direction)) - -> (v e -> v e -> (Word, m t Direction)) - -> OpenDynamicCharacter v e - -> OpenDynamicCharacter v e - -> String + ∷ ( FiniteBits e + , Matrix m t Direction + , Ord (v e) + , Vector v e + ) + ⇒ (v e → v e → (Word, m t Direction)) + → (v e → v e → (Word, m t Direction)) + → OpenDynamicCharacter v e + → OpenDynamicCharacter v e + → String directOptimizationDiffDirectionMatricies matrixGenerator1 matrixGenerator2 lhs rhs = let -- Remove gaps from the inputs and measure the results to determine -- which ungapped character is longer and which is shorter. @@ -64,296 +64,319 @@ directOptimizationDiffDirectionMatricies matrixGenerator1 matrixGenerator2 lhs r lesserMeds = extractMediansGapped lesser longerMeds = extractMediansGapped longer in case basicLength lesserMeds of - -- Neither character was Missing, but one or both are empty when gaps are removed - 0 -> "One character was all gaps" - -- Both have some non-gap elements, perform string alignment - _ -> let dm1 = snd $ matrixGenerator1 lesserMeds longerMeds + -- Neither character was Missing, but one or both are empty when gaps are removed + 0 → "One character was all gaps" + -- Both have some non-gap elements, perform string alignment + _ → + let dm1 = snd $ matrixGenerator1 lesserMeds longerMeds dm2 = snd $ matrixGenerator2 lesserMeds longerMeds in diffDirectionMatrix lesserMeds longerMeds dm1 dm2 --- | --- Serializes an alignment matrix to a 'String'. Uses input characters for row --- and column labelings. --- --- Useful for debugging purposes. -renderMatrix - :: ( Matrix m t x - , Vector v e - , Show x - ) - => v e -- ^ Shorter vector of elements - -> v e -- ^ Longer vector of elements - -> m t x -- ^ Matrix of cells - -> String -renderMatrix lesser longer mtx = unlines - [ dimensionPrefix - , headerRow - , barRow - , renderedRows - ] - where - (colCount, rowCount, lesserTokens, longerTokens, maxPrefixWidth, maxColumnWidth, [matrixTokens]) = - getMatrixConstants lesser longer [mtx] -{- - toShownIntegers = fmap (show . showBitsValue) . otoList - - showBitsValue :: FiniteBits b => b -> Word - showBitsValue b = go (finiteBitSize b) 0 - where - go 0 v = v - go i v = let i' = i-1 - v' | b `testBit` i' = v + bit i' - | otherwise = v - in go i' v' --} - - dimensionPrefix = " " <> unwords - [ "Dimensions:" - , show rowCount - , "X" - , show colCount - ] +{- | +Serializes an alignment matrix to a 'String'. Uses input characters for row +and column labelings. - headerRow = fold - [ " " - , pad maxPrefixWidth "\\" - , "| " - , pad maxColumnWidth "*" - , concatMap (pad maxColumnWidth) longerTokens - ] - - barRow = fold - [ " " - , bar maxPrefixWidth - , "+" - , concatMap (const (bar maxColumnWidth)) $ undefined : longerTokens +Useful for debugging purposes. +-} +renderMatrix + ∷ ( Matrix m t x + , Vector v e + , Show x + ) + ⇒ v e + -- ^ Shorter vector of elements + → v e + -- ^ Longer vector of elements + → m t x + -- ^ Matrix of cells + → String +renderMatrix lesser longer mtx = + unlines + [ dimensionPrefix + , headerRow + , barRow + , renderedRows ] - where - bar n = replicate (n+1) '-' - - renderedRows = unlines $ zipWith renderRow ("*":lesserTokens) matrixTokens - where - renderRow e cs = prefix <> suffix - where - prefix = fold [" ", pad maxPrefixWidth e, "| "] - suffix = concatMap (pad maxColumnWidth) cs - - pad :: Int -> String -> String - pad n e = replicate (n - len) ' ' <> e <> " " - where - len = length e - - --- | --- Serializes an alignment matrix to a 'String'. Uses input characters for row --- and column labelings. --- --- Useful for debugging purposes. + where + (colCount, rowCount, lesserTokens, longerTokens, maxPrefixWidth, maxColumnWidth, [matrixTokens]) = + getMatrixConstants lesser longer [mtx] + {- + toShownIntegers = fmap (show . showBitsValue) . otoList + + showBitsValue :: FiniteBits b => b -> Word + showBitsValue b = go (finiteBitSize b) 0 + where + go 0 v = v + go i v = let i' = i-1 + v' | b `testBit` i' = v + bit i' + | otherwise = v + in go i' v' + -} + + dimensionPrefix = + " " + <> unwords + [ "Dimensions:" + , show rowCount + , "X" + , show colCount + ] + + headerRow = + fold + [ " " + , pad maxPrefixWidth "\\" + , "| " + , pad maxColumnWidth "*" + , concatMap (pad maxColumnWidth) longerTokens + ] + + barRow = + fold + [ " " + , bar maxPrefixWidth + , "+" + , concatMap (const (bar maxColumnWidth)) $ undefined : longerTokens + ] + where + bar n = replicate (n + 1) '-' + + renderedRows = unlines $ zipWith renderRow ("*" : lesserTokens) matrixTokens + where + renderRow e cs = prefix <> suffix + where + prefix = fold [" ", pad maxPrefixWidth e, "| "] + suffix = concatMap (pad maxColumnWidth) cs + + pad ∷ Int → String → String + pad n e = replicate (n - len) ' ' <> e <> " " + where + len = length e + + +{- | +Serializes an alignment matrix to a 'String'. Uses input characters for row +and column labelings. + +Useful for debugging purposes. +-} renderDirectionMatrix - :: ( Matrix m t Direction - , Vector v e - ) - => v e - -> v e - -> m t Direction - -> String -renderDirectionMatrix lesser longer mtx = unlines - [ dimensionPrefix - , headerRow - , barRow - , renderedRows - ] - where - (colCount, rowCount, lesserTokens, longerTokens, maxPrefixWidth, maxColumnWidth, [matrixTokens]) = - getMatrixConstants lesser longer [mtx] - - tracebackCells = getTracebackIndices mtx - - dimensionPrefix = " " <> unwords - [ "Dimensions:" - , show rowCount - , "X" - , show colCount - ] - - headerRow = fold - [ " " - , pad maxPrefixWidth "\\" - , "| " - , pad maxColumnWidth "*" - , concatMap (pad maxColumnWidth) longerTokens + ∷ ( Matrix m t Direction + , Vector v e + ) + ⇒ v e + → v e + → m t Direction + → String +renderDirectionMatrix lesser longer mtx = + unlines + [ dimensionPrefix + , headerRow + , barRow + , renderedRows ] - - barRow = fold - [ " " - , bar maxPrefixWidth - , "+" - , concatMap (const (bar maxColumnWidth)) $ undefined : longerTokens - ] - where - bar n = replicate (n+1) '-' - - renderedRows = unlines $ zipWith3 renderRow [0..] ("*":lesserTokens) matrixTokens - where - renderRow i e cs = prefix <> suffix - where - prefix = fold [" ", pad maxPrefixWidth e, "| "] - suffix = fold $ zipWith (renderCell i) [0..] cs - - renderCell i j = fmap f . pad maxColumnWidth - where - f | (i,j) `elem` tracebackCells = boldDirection - | otherwise = id - - pad :: Int -> String -> String - pad n e = replicate (n - len) ' ' <> e <> " " - where - len = length e - - --- | --- Serializes an alignment matrix to a 'String'. Uses input characters for row --- and column labelings. --- --- Useful for debugging purposes. + where + (colCount, rowCount, lesserTokens, longerTokens, maxPrefixWidth, maxColumnWidth, [matrixTokens]) = + getMatrixConstants lesser longer [mtx] + + tracebackCells = getTracebackIndices mtx + + dimensionPrefix = + " " + <> unwords + [ "Dimensions:" + , show rowCount + , "X" + , show colCount + ] + + headerRow = + fold + [ " " + , pad maxPrefixWidth "\\" + , "| " + , pad maxColumnWidth "*" + , concatMap (pad maxColumnWidth) longerTokens + ] + + barRow = + fold + [ " " + , bar maxPrefixWidth + , "+" + , concatMap (const (bar maxColumnWidth)) $ undefined : longerTokens + ] + where + bar n = replicate (n + 1) '-' + + renderedRows = unlines $ zipWith3 renderRow [0 ..] ("*" : lesserTokens) matrixTokens + where + renderRow i e cs = prefix <> suffix + where + prefix = fold [" ", pad maxPrefixWidth e, "| "] + suffix = fold $ zipWith (renderCell i) [0 ..] cs + + renderCell i j = fmap f . pad maxColumnWidth + where + f + | (i, j) `elem` tracebackCells = boldDirection + | otherwise = id + + pad ∷ Int → String → String + pad n e = replicate (n - len) ' ' <> e <> " " + where + len = length e + + +{- | +Serializes an alignment matrix to a 'String'. Uses input characters for row +and column labelings. + +Useful for debugging purposes. +-} diffDirectionMatrix - :: ( Matrix m t Direction - , Vector v e - ) - => v e - -> v e - -> m t Direction - -> m t Direction - -> String -diffDirectionMatrix lesser longer mtx1 mtx2 = unlines - [ dimensionPrefix - , headerRow - , barRow - , renderedRows - ] - where - (colCount, rowCount, lesserTokens, longerTokens, maxPrefixWidth, maxColumnWidth, [tokMtx1,tokMtx2]) = - getMatrixConstants lesser longer [mtx1, mtx2] - - tracebackCells1 = getTracebackIndices mtx1 - tracebackCells2 = getTracebackIndices mtx2 - - dimensionPrefix = " " <> unwords - [ "Dimensions:" - , show rowCount - , "X" - , show colCount - ] - - headerRow = fold - [ " " - , pad maxPrefixWidth "\\" - , "| " - , pad maxColumnWidth "*" - , concatMap (pad maxColumnWidth) longerTokens - ] - - barRow = fold - [ " " - , bar maxPrefixWidth - , "+" - , concatMap (const (bar maxColumnWidth)) $ undefined : longerTokens + ∷ ( Matrix m t Direction + , Vector v e + ) + ⇒ v e + → v e + → m t Direction + → m t Direction + → String +diffDirectionMatrix lesser longer mtx1 mtx2 = + unlines + [ dimensionPrefix + , headerRow + , barRow + , renderedRows ] - where - bar n = replicate (n+1) '-' - - renderedRows = unlines $ zipWith3 renderRow ("*":lesserTokens) strMtx1 strMtx2 - where - renderRow e xs ys = prefix <> suffix - where - prefix = fold [" ", pad maxPrefixWidth e, "| "] - suffix = fold $ zipWith renderCell xs ys - - renderCell x y - | x' == y' = x - | otherwise = replicate (max (length x) (length y)) ' ' - where - x' = boldDirection <$> x - y' = boldDirection <$> y - - strMtx1 = tok2Str tracebackCells1 tokMtx1 - strMtx2 = tok2Str tracebackCells2 tokMtx2 - - tok2Str s = zipWith f [0..] - where - f i = zipWith (g i) [0..] - g i j = fmap h . pad maxColumnWidth - where - h | (i,j) `member` s = boldDirection - | otherwise = id - - - pad :: Int -> String -> String - pad n e = replicate (n - len) ' ' <> e <> " " - where - len = length e - - --- | --- Get the indices of the traceback route. + where + (colCount, rowCount, lesserTokens, longerTokens, maxPrefixWidth, maxColumnWidth, [tokMtx1, tokMtx2]) = + getMatrixConstants lesser longer [mtx1, mtx2] + + tracebackCells1 = getTracebackIndices mtx1 + tracebackCells2 = getTracebackIndices mtx2 + + dimensionPrefix = + " " + <> unwords + [ "Dimensions:" + , show rowCount + , "X" + , show colCount + ] + + headerRow = + fold + [ " " + , pad maxPrefixWidth "\\" + , "| " + , pad maxColumnWidth "*" + , concatMap (pad maxColumnWidth) longerTokens + ] + + barRow = + fold + [ " " + , bar maxPrefixWidth + , "+" + , concatMap (const (bar maxColumnWidth)) $ undefined : longerTokens + ] + where + bar n = replicate (n + 1) '-' + + renderedRows = unlines $ zipWith3 renderRow ("*" : lesserTokens) strMtx1 strMtx2 + where + renderRow e xs ys = prefix <> suffix + where + prefix = fold [" ", pad maxPrefixWidth e, "| "] + suffix = fold $ zipWith renderCell xs ys + + renderCell x y + | x' == y' = x + | otherwise = replicate (max (length x) (length y)) ' ' + where + x' = boldDirection <$> x + y' = boldDirection <$> y + + strMtx1 = tok2Str tracebackCells1 tokMtx1 + strMtx2 = tok2Str tracebackCells2 tokMtx2 + + tok2Str s = zipWith f [0 ..] + where + f i = zipWith (g i) [0 ..] + g i j = fmap h . pad maxColumnWidth + where + h + | (i, j) `member` s = boldDirection + | otherwise = id + + pad ∷ Int → String → String + pad n e = replicate (n - len) ' ' <> e <> " " + where + len = length e + + +{- | +Get the indices of the traceback route. +-} getTracebackIndices - :: Matrix m t Direction - => m t Direction - -> Set (Int, Int) + ∷ (Matrix m t Direction) + ⇒ m t Direction + → Set (Int, Int) getTracebackIndices mtx = fromDistinctAscList $ go (# m - 1, n - 1 #) - where - getDirection = curry $ unsafeIndex mtx - (m,n) = dim mtx - go (# i, j #) - | i < 0 || j < 0 = [] - | (i,j) == (0,0) = [(0,0)] - | otherwise = - (i,j) : case getDirection i j of - LeftArrow -> go (# i , j - 1 #) - DiagArrow -> go (# i - 1, j - 1 #) - UpArrow -> go (# i - 1, j #) - - -characterVectorToIndices :: Vector v e => v e -> [String] + where + getDirection = curry $ unsafeIndex mtx + (m, n) = dim mtx + go (# i, j #) + | i < 0 || j < 0 = [] + | (i, j) == (0, 0) = [(0, 0)] + | otherwise = + (i, j) : case getDirection i j of + LeftArrow → go (# i, j - 1 #) + DiagArrow → go (# i - 1, j - 1 #) + UpArrow → go (# i - 1, j #) + + +characterVectorToIndices ∷ (Vector v e) ⇒ v e → [String] characterVectorToIndices = - let numbers = tail $ pure <$> cycle ['0'..'9'] + let numbers = tail $ pure <$> cycle ['0' .. '9'] in zipWith const numbers . toList -tokenizeMatrix :: (Matrix m t x, Show x) => m t x -> [[String]] +tokenizeMatrix ∷ (Matrix m t x, Show x) ⇒ m t x → [[String]] tokenizeMatrix = fmap (fmap show) . toLists -maxLengthOfGrid :: (Foldable g, Foldable r, Foldable f, Functor g, Functor r) => g (r (f a)) -> Int +maxLengthOfGrid ∷ (Foldable g, Foldable r, Foldable f, Functor g, Functor r) ⇒ g (r (f a)) → Int maxLengthOfGrid = maximum . fmap maxLengthOfRow -maxLengthOfRow :: (Foldable r, Foldable f, Functor r) => r (f a) -> Int +maxLengthOfRow ∷ (Foldable r, Foldable f, Functor r) ⇒ r (f a) → Int maxLengthOfRow = maximum . fmap length getMatrixConstants - :: ( Matrix m t x - , Show x - , Vector v e - ) - => v e - -> v e - -> [m t x] - -> (Int, Int, [String], [String], Int, Int, [[[String]]]) + ∷ ( Matrix m t x + , Show x + , Vector v e + ) + ⇒ v e + → v e + → [m t x] + → (Int, Int, [String], [String], Int, Int, [[[String]]]) getMatrixConstants lesser longer matrices = (colCount, rowCount, lesserTokens, longerTokens, maxPrefixWidth, maxColumnWidth, matrixTokens) - where - colCount = basicLength longer + 1 - rowCount = basicLength lesser + 1 - lesserTokens = characterVectorToIndices lesser - longerTokens = characterVectorToIndices longer - maxPrefixWidth = maxLengthOfRow lesserTokens - maxHeaderWidth = maxLengthOfRow longerTokens - - matrixTokens = tokenizeMatrix <$> matrices - maxColumnWidth = maximum $ - maxHeaderWidth : (maxLengthOfGrid <$> matrixTokens) - - + where + colCount = basicLength longer + 1 + rowCount = basicLength lesser + 1 + lesserTokens = characterVectorToIndices lesser + longerTokens = characterVectorToIndices longer + maxPrefixWidth = maxLengthOfRow lesserTokens + maxHeaderWidth = maxLengthOfRow longerTokens + + matrixTokens = tokenizeMatrix <$> matrices + maxColumnWidth = + maximum $ + maxHeaderWidth : (maxLengthOfGrid <$> matrixTokens) diff --git a/lib/dynamic-character/src/DirectOptimization/Pairwise/Wide.hs b/lib/dynamic-character/src/DirectOptimization/Pairwise/Wide.hs index a01c47981..3da18b415 100644 --- a/lib/dynamic-character/src/DirectOptimization/Pairwise/Wide.hs +++ b/lib/dynamic-character/src/DirectOptimization/Pairwise/Wide.hs @@ -1,17 +1,18 @@ -module DirectOptimization.Pairwise.Wide - ( WideDynamicCharacter - , WideState - , widePairwiseDO - ) where +module DirectOptimization.Pairwise.Wide ( + WideDynamicCharacter, + WideState, + widePairwiseDO, +) where import Bio.DynamicCharacter +import Bio.DynamicCharacter.Element (WideState) import DirectOptimization.Pairwise.Ukkonen widePairwiseDO - :: Word - -> (WideState -> WideState -> (WideState, Word)) - -> WideDynamicCharacter - -> WideDynamicCharacter - -> (Word, WideDynamicCharacter) + ∷ Word + → (WideState → WideState → (WideState, Word)) + → WideDynamicCharacter + → WideDynamicCharacter + → (Word, WideDynamicCharacter) widePairwiseDO = ukkonenDO diff --git a/lib/dynamic-character/src/DirectOptimization/PreOrder.hs b/lib/dynamic-character/src/DirectOptimization/PreOrder.hs index 6354b873e..7a0a7387c 100644 --- a/lib/dynamic-character/src/DirectOptimization/PreOrder.hs +++ b/lib/dynamic-character/src/DirectOptimization/PreOrder.hs @@ -1,9 +1,9 @@ {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE Strict #-} -module DirectOptimization.PreOrder - ( preOrderLogic - ) where +module DirectOptimization.PreOrder ( + preOrderLogic, +) where import Bio.DynamicCharacter import Control.Monad @@ -12,57 +12,67 @@ import Data.STRef import Data.Vector.Generic (Vector) --- | --- Faithful translation of Algorithm 8 (Non-root Node Alignment) from the --- "Efficient Implied Alignment" paper found at: --- +{- | +Faithful translation of Algorithm 8 (Non-root Node Alignment) from the +"Efficient Implied Alignment" paper found at: + +-} {-# INLINEABLE preOrderLogic #-} -{-# SPECIALISE preOrderLogic :: Bool -> SlimDynamicCharacter -> SlimDynamicCharacter -> SlimDynamicCharacter -> SlimDynamicCharacter #-} -{-# SPECIALISE preOrderLogic :: Bool -> WideDynamicCharacter -> WideDynamicCharacter -> WideDynamicCharacter -> WideDynamicCharacter #-} -{-# SPECIALISE preOrderLogic :: Bool -> HugeDynamicCharacter -> HugeDynamicCharacter -> HugeDynamicCharacter -> HugeDynamicCharacter #-} +{-# SPECIALIZE preOrderLogic ∷ + Bool → SlimDynamicCharacter → SlimDynamicCharacter → SlimDynamicCharacter → SlimDynamicCharacter + #-} +{-# SPECIALIZE preOrderLogic ∷ + Bool → WideDynamicCharacter → WideDynamicCharacter → WideDynamicCharacter → WideDynamicCharacter + #-} +{-# SPECIALIZE preOrderLogic ∷ + Bool → HugeDynamicCharacter → HugeDynamicCharacter → HugeDynamicCharacter → HugeDynamicCharacter + #-} preOrderLogic - :: ( FiniteBits a - , Vector v a - ) - => Bool - -> (v a, v a, v a) -- ^ Parent Final Alignment - -> (v a, v a, v a) -- ^ Parent Preliminary Context - -> (v a, v a, v a) -- ^ Child Preliminary Context - -> (v a, v a, v a) -- ^ Child Final Alignment + ∷ ( FiniteBits a + , Vector v a + ) + ⇒ Bool + → (v a, v a, v a) + -- ^ Parent Final Alignment + → (v a, v a, v a) + -- ^ Parent Preliminary Context + → (v a, v a, v a) + -- ^ Child Preliminary Context + → (v a, v a, v a) + -- ^ Child Final Alignment preOrderLogic isLeftChild pAlignment pContext cContext = forceDynamicCharacter $ unsafeCharacterBuiltByST caLen f - where - ccLen = fromEnum $ characterLength cContext - caLen = characterLength pAlignment - indices = [ 0 .. fromEnum caLen - 1 ] + where + ccLen = fromEnum $ characterLength cContext + caLen = characterLength pAlignment + indices = [0 .. fromEnum caLen - 1] - f char = pokeTempChar char *> fillTempChar char + f char = pokeTempChar char *> fillTempChar char - -- We set an arbitrary element from the parent alignemnt context to the first element of the temporary character. - -- This ensures that at least one element can be querried for the determining "nil" and "gap" values. - pokeTempChar char = setFrom pAlignment char 0 0 + -- We set an arbitrary element from the parent alignemnt context to the first element of the temporary character. + -- This ensures that at least one element can be querried for the determining "nil" and "gap" values. + pokeTempChar char = setFrom pAlignment char 0 0 - -- Select character building function - fillTempChar - | isMissing cContext = missingλ -- Missing case is all gaps - | otherwise = alignmentλ -- Standard pre-order logic + -- Select character building function + fillTempChar + | isMissing cContext = missingλ -- Missing case is all gaps + | otherwise = alignmentλ -- Standard pre-order logic + missingλ char = + forM_ indices (char `setGapped`) - missingλ char = - forM_ indices (char `setGapped`) + alignmentλ char = do + j' ← newSTRef 0 + k' ← newSTRef 0 - alignmentλ char = do - j' <- newSTRef 0 - k' <- newSTRef 0 - - forM_ indices $ \i -> do - k <- readSTRef k' - if k > ccLen || pAlignment `isGapped` i - then char `setGapped` i - else do - j <- readSTRef j' - modifySTRef j' succ - -- Remember that 'Delete' leaves 'voids' in the 'left' character. - if pAlignment `isAlign` i - || (not isLeftChild && pAlignment `isDelete` i && pContext `isDelete` j) - || ( isLeftChild && pAlignment `isInsert` i && pContext `isInsert` j) - then modifySTRef k' succ *> setFrom cContext char k i - else char `setGapped` i + forM_ indices $ \i → do + k ← readSTRef k' + if k > ccLen || pAlignment `isGapped` i + then char `setGapped` i + else do + j ← readSTRef j' + modifySTRef j' succ + -- Remember that 'Delete' leaves 'voids' in the 'left' character. + if pAlignment `isAlign` i + || (not isLeftChild && pAlignment `isDelete` i && pContext `isDelete` j) + || (isLeftChild && pAlignment `isInsert` i && pContext `isInsert` j) + then modifySTRef k' succ *> setFrom cContext char k i + else char `setGapped` i diff --git a/lib/dynamic-character/src/DirectOptimization/Wrapper/DOHuge.hs b/lib/dynamic-character/src/DirectOptimization/Wrapper/DOHuge.hs index c2f6abf5c..69c2f5d8b 100644 --- a/lib/dynamic-character/src/DirectOptimization/Wrapper/DOHuge.hs +++ b/lib/dynamic-character/src/DirectOptimization/Wrapper/DOHuge.hs @@ -1,25 +1,26 @@ module DirectOptimization.DOHuge where import Analysis.Parsimony.Dynamic.DirectOptimization.Pairwise.Huge -import Data.BitVector.LittleEndian (BitVector, dimension) +import Data.BitVector.LittleEndian (BitVector, dimension) import Data.Bits import Data.Foldable import Data.MetricRepresentation -import Data.Vector (Vector, head) -import Prelude hiding (head) +import Data.Vector (Vector, head) +import Prelude hiding (head) + wrapperHugeDO - :: Vector BitVector - -> Vector BitVector - -> MetricRepresentation BitVector - -> (Vector BitVector, Int) + :: Vector BitVector + -> Vector BitVector + -> MetricRepresentation BitVector + -> (Vector BitVector, Int) wrapperHugeDO lhs rhs tcmMemo = (medians, fromEnum cost) - where - (cost, (medians,_,_)) = hugePairwiseDO (minInDelCost tcmMemo) gapState (retreivePairwiseTCM tcmMemo) (lhs, lhs, lhs) (rhs, rhs, rhs) + where + (cost, (medians, _, _)) = hugePairwiseDO (minInDelCost tcmMemo) gapState (retreivePairwiseTCM tcmMemo) (lhs, lhs, lhs) (rhs, rhs, rhs) - gapState = bit . fromEnum $ n - 1 - n = case length lhs of - 0 -> case length rhs of - 0 -> 64 - _ -> dimension $ head rhs - _ -> dimension $ head lhs + gapState = bit . fromEnum $ n - 1 + n = case length lhs of + 0 -> case length rhs of + 0 -> 64 + _ -> dimension $ head rhs + _ -> dimension $ head lhs diff --git a/lib/dynamic-character/src/DirectOptimization/Wrapper/DOSlim.hs b/lib/dynamic-character/src/DirectOptimization/Wrapper/DOSlim.hs index 661b6de5f..7eed74a3a 100644 --- a/lib/dynamic-character/src/DirectOptimization/Wrapper/DOSlim.hs +++ b/lib/dynamic-character/src/DirectOptimization/Wrapper/DOSlim.hs @@ -1,31 +1,32 @@ module DirectOptimization.DOSlim where -import Analysis.Parsimony.Dynamic.DirectOptimization.Pairwise.Slim -import Data.BitVector.LittleEndian (BitVector) -import qualified Data.BitVector.LittleEndian as BV -import Data.Bits -import Data.TCM.Dense -import Data.Vector (Vector, (!)) -import qualified Data.Vector as V -import qualified Data.Vector.Storable as SV -import Foreign.C.Types (CUInt) +import Analysis.Parsimony.Dynamic.DirectOptimization.Pairwise.Slim +import Data.BitVector.LittleEndian (BitVector) +import qualified Data.BitVector.LittleEndian as BV +import Data.Bits +import Data.TCM.Dense +import Data.Vector (Vector, (!)) +import qualified Data.Vector as V +import qualified Data.Vector.Storable as SV +import Foreign.C.Types (CUInt) wrapperSlimDO :: Vector BitVector -> Vector BitVector -> DenseTransitionCostMatrix -> (Vector BitVector, Int) - -- wrapperPCG_DO_FFI lhs rhs tcm | trace (show tcm) False= undefined wrapperSlimDO lhs rhs tcmDense = (resultMedians, fromEnum resultCost) - where - (resultCost, resultMedians) = slimDC2BVs 5 <$> slimPairwiseDO tcmDense lhsDC rhsDC + where + (resultCost, resultMedians) = slimDC2BVs 5 <$> slimPairwiseDO tcmDense lhsDC rhsDC - lhsDC = bvs2SlimDC lhs - rhsDC = bvs2SlimDC rhs - {-} - tcmDense = generateDenseTransitionCostMatrix 0 5 getCost - getCost i j = - let x = SM.getFullVects tcm - in toEnum $ (x ! fromEnum i) ! fromEnum j - -} + lhsDC = bvs2SlimDC lhs + rhsDC = bvs2SlimDC rhs + + +{-} +tcmDense = generateDenseTransitionCostMatrix 0 5 getCost +getCost i j = + let x = SM.getFullVects tcm + in toEnum $ (x ! fromEnum i) ! fromEnum j + -} {- wrapperSlimDO :: Vector BitVector -> Vector BitVector -> Vector (Vector Int) -> (Vector BitVector, Int) @@ -42,29 +43,28 @@ wrapperSlimDO lhs rhs tcm = (resultMedians, fromEnum resultCost) in toEnum $ (x ! fromEnum i) ! fromEnum j -} ---specializedAlphabetToDNA :: Alphabet String ---specializedAlphabetToDNA = fromSymbols $ show <$> (0 :: Word) :| [1 .. 4] - +-- specializedAlphabetToDNA :: Alphabet String +-- specializedAlphabetToDNA = fromSymbols $ show <$> (0 :: Word) :| [1 .. 4] bvs2SlimDC :: V.Vector BitVector -> SlimDynamicCharacter -bvs2SlimDC v = (x,x,x) - where - x = SV.generate (V.length v) $ \i -> bv2w (v ! i) +bvs2SlimDC v = (x, x, x) + where + x = SV.generate (V.length v) $ \i -> bv2w (v ! i) - bv2w :: BitVector -> CUInt - bv2w bv = - let f i a - | bv `testBit` i = a `setBit` i - | otherwise = a - in foldr f 0 [0 .. fromEnum $ BV.dimension bv - 1] + bv2w :: BitVector -> CUInt + bv2w bv = + let f i a + | bv `testBit` i = a `setBit` i + | otherwise = a + in foldr f 0 [0 .. fromEnum $ BV.dimension bv - 1] slimDC2BVs :: Int -> SlimDynamicCharacter -> V.Vector BitVector -slimDC2BVs n (x,_,_) = V.generate (SV.length x) $ \i -> w2bv (x SV.! i) - where - w2bv :: CUInt -> BitVector - w2bv w = - let f i a - | w `testBit` i = a `setBit` i - | otherwise = a - in foldr f (BV.fromNumber (toEnum n) (0:: Word)) [0 .. n - 1] +slimDC2BVs n (x, _, _) = V.generate (SV.length x) $ \i -> w2bv (x SV.! i) + where + w2bv :: CUInt -> BitVector + w2bv w = + let f i a + | w `testBit` i = a `setBit` i + | otherwise = a + in foldr f (BV.fromNumber (toEnum n) (0 :: Word)) [0 .. n - 1] diff --git a/lib/dynamic-character/src/DirectOptimization/Wrapper/DOWide.hs b/lib/dynamic-character/src/DirectOptimization/Wrapper/DOWide.hs index 38fdf9f92..3c818bfdc 100644 --- a/lib/dynamic-character/src/DirectOptimization/Wrapper/DOWide.hs +++ b/lib/dynamic-character/src/DirectOptimization/Wrapper/DOWide.hs @@ -1,63 +1,62 @@ module DirectOptimization.DOWide where -import Analysis.Parsimony.Dynamic.DirectOptimization.Pairwise.Wide ---import Analysis.Parsimony.Dynamic.DirectOptimization.Pairwise.Ukkonen2 -import Control.Arrow (first) -import Data.BitVector.LittleEndian (BitVector) -import qualified Data.BitVector.LittleEndian as BV -import Data.Bits -import Data.MetricRepresentation -import Data.Vector (Vector, (!)) -import qualified Data.Vector as V -import qualified Data.Vector.Unboxed as UV -import Data.Word +import Analysis.Parsimony.Dynamic.DirectOptimization.Pairwise.Wide +-- import Analysis.Parsimony.Dynamic.DirectOptimization.Pairwise.Ukkonen2 +import Control.Arrow (first) +import Data.BitVector.LittleEndian (BitVector) +import qualified Data.BitVector.LittleEndian as BV +import Data.Bits +import Data.MetricRepresentation +import Data.Vector (Vector, (!)) +import qualified Data.Vector as V +import qualified Data.Vector.Unboxed as UV +import Data.Word wrapperWideDO - :: Vector BitVector - -> Vector BitVector - -> MetricRepresentation BitVector --Word64 - -> (Vector BitVector, Int) + :: Vector BitVector + -> Vector BitVector + -> MetricRepresentation BitVector -- Word64 + -> (Vector BitVector, Int) wrapperWideDO lhs rhs metric = (wideDC2BVs (fromIntegral n) resultMedians, fromEnum resultCost) - where - (resultCost, resultMedians) = widePairwiseDO (minInDelCost metric) gap tcm lhsDC rhsDC --- (resultCost, resultMedians) = ukkonenDO (minInDelCost metric) gap tcm lhsDC rhsDC + where + (resultCost, resultMedians) = widePairwiseDO (minInDelCost metric) gap tcm lhsDC rhsDC + -- (resultCost, resultMedians) = ukkonenDO (minInDelCost metric) gap tcm lhsDC rhsDC - gap = bit . fromEnum $ n - 1 + gap = bit . fromEnum $ n - 1 - tcm :: Word64 -> Word64 -> (Word64, Word) - tcm x y = first BV.toUnsignedNumber (retreivePairwiseTCM metric) (BV.fromNumber 64 x) (BV.fromNumber 64 y) + tcm :: Word64 -> Word64 -> (Word64, Word) + tcm x y = first BV.toUnsignedNumber (retreivePairwiseTCM metric) (BV.fromNumber 64 x) (BV.fromNumber 64 y) + n = case length lhs of + 0 -> case length rhs of + 0 -> 64 + _ -> BV.dimension $ V.head rhs + _ -> BV.dimension $ V.head lhs - n = case length lhs of - 0 -> case length rhs of - 0 -> 64 - _ -> BV.dimension $ V.head rhs - _ -> BV.dimension $ V.head lhs - - lhsDC = bvs2WideDC lhs - rhsDC = bvs2WideDC rhs + lhsDC = bvs2WideDC lhs + rhsDC = bvs2WideDC rhs bvs2WideDC :: V.Vector BitVector -> WideDynamicCharacter -bvs2WideDC v = (x,x,x) - where - x = UV.generate (V.length v) $ \i -> bv2w (v ! i) +bvs2WideDC v = (x, x, x) + where + x = UV.generate (V.length v) $ \i -> bv2w (v ! i) - bv2w :: BitVector -> Word64 - bv2w bv = - let f i a - | bv `testBit` i = a `setBit` i - | otherwise = a - in foldr f 0 [0 .. fromEnum $ BV.dimension bv - 1] + bv2w :: BitVector -> Word64 + bv2w bv = + let f i a + | bv `testBit` i = a `setBit` i + | otherwise = a + in foldr f 0 [0 .. fromEnum $ BV.dimension bv - 1] wideDC2BVs :: Int -> WideDynamicCharacter -> V.Vector BitVector -wideDC2BVs n (x,_,_) = V.generate (UV.length x) $ \i -> w2bv (x UV.! i) - where - w2bv :: Word64 -> BitVector - w2bv w = - let f i a - | w `testBit` i = a `setBit` i - | otherwise = a - in foldr f (BV.fromNumber (toEnum n) (0 :: Word)) [0 .. n - 1] +wideDC2BVs n (x, _, _) = V.generate (UV.length x) $ \i -> w2bv (x UV.! i) + where + w2bv :: Word64 -> BitVector + w2bv w = + let f i a + | w `testBit` i = a `setBit` i + | otherwise = a + in foldr f (BV.fromNumber (toEnum n) (0 :: Word)) [0 .. n - 1]