From 9ed642c0a38e9be44085aa3de0337bc41c075121 Mon Sep 17 00:00:00 2001 From: ocramz Date: Thu, 9 Apr 2020 11:18:44 +0200 Subject: [PATCH 1/2] add Data.Align.Text --- CHANGELOG.md | 4 +++ align.cabal | 7 +++- src/Data/Align.hs | 66 +++++-------------------------------- src/Data/Align/Demo.hs | 10 ++++++ src/Data/Align/Text.hs | 60 ++++++++++++++++++++++++++++++++++ src/Data/Align/Types.hs | 72 +++++++++++++++++++++++++++++++++++++++++ stack.yaml | 30 +++++++++++++++++ 7 files changed, 190 insertions(+), 59 deletions(-) create mode 100644 CHANGELOG.md create mode 100644 src/Data/Align/Text.hs create mode 100644 src/Data/Align/Types.hs create mode 100644 stack.yaml diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..2d70482 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,4 @@ +0.2 + +* Add 'text' dependency +* Add 'Data.Align.Text' diff --git a/align.cabal b/align.cabal index 2f333fa..40c311d 100644 --- a/align.cabal +++ b/align.cabal @@ -1,5 +1,5 @@ name: align -version: 0.1.1.2 +version: 0.2 synopsis: Sequence alignment algorithms. description: Global or local sequence alignment, not exclusively for text. license: BSD3 @@ -9,15 +9,20 @@ maintainer: palotai.robin@gmail.com category: Data build-type: Simple cabal-version: >=1.10 +extra-source-files: CHANGELOG.md + source-repository head type: git location: https://github.com/robinp/align.git library exposed-modules: Data.Align, + Data.Align.Text, Data.Align.Demo + other-modules: Data.Align.Types build-depends: base >=4.6 && <5, containers, + text, transformers, vector >=0.10 hs-source-dirs: src diff --git a/src/Data/Align.hs b/src/Data/Align.hs index 3a0cae3..c871087 100644 --- a/src/Data/Align.hs +++ b/src/Data/Align.hs @@ -1,4 +1,5 @@ {-# LANGUAGE RecordWildCards #-} +{-# OPTIONS_GHC -Wno-name-shadowing #-} -- | Collection of functions for global, local and multi-sequence alignment. module Data.Align ( @@ -20,7 +21,7 @@ module Data.Align ) where import Control.Monad.Trans.State.Strict -import Data.Function (fix, on) +import Data.Function (on) import qualified Data.List as L import qualified Data.Map.Strict as M import Data.Maybe (fromMaybe) @@ -28,58 +29,8 @@ import Data.Ord import qualified Data.Vector as V import qualified Data.Vector.Generic as G -data AlignConfig a s = AlignConfig - { acPairScore :: a -> a -> s - , ac_initial_gap_penalty :: s - , ac_gap_penalty :: s - } - --- | Configures the scores used when aligning. --- The gap scores should be negative in order to be penalties. -alignConfig :: (a -> a -> s) -- ^ Scoring function. - -> s -- ^ Initial gap score. - -> s -- ^ Gap score. - -> AlignConfig a s -alignConfig = AlignConfig - --- | Configuration for local alignment. -localAlignConfig - :: Num s - => (a -> a -> s) -- ^ Scoring function. - -> s -- ^ Gap score. - -> AlignConfig a s -localAlignConfig f = alignConfig f 0 - --- | Either an unmatched item or a match. -type Step a = Either (Either a a) (a, a) - -stepLeft = Left . Left -stepRight = Left . Right -stepBoth a b = Right (a,b) - -isMatch :: Step a -> Bool -isMatch (Right _) = True -isMatch _ = False - -isLeft :: Step a -> Bool -isLeft (Left (Left _)) = True -isLeft _ = False - -isRight :: Step a -> Bool -isRight (Left (Right _)) = True -isRight _ = False - --- | The result of the alignment. -data Trace a s = Trace - { traceScore :: s - , trace :: [Step a] - } - -instance (Show a, Show s) => Show (Trace a s) where - show (Trace s t) = "Trace(score = " ++ show s ++ ", steps = " ++ show t ++ ")" +import Data.Align.Types -mt `tappend` (Trace z (t:_)) = - fmap (\(Trace s ts) -> Trace (s+z) (t:ts)) mt -- | Utility for displaying a Char-based alignment. debugAlign :: [Step Char] -> String @@ -108,18 +59,17 @@ debugAlign = go [] [] -- -applied align :: (G.Vector v a, Num s, Ord s) => AlignConfig a s - -> v a -- ^ Left sequence. - -> v a -- ^ Right sequence. + -> v a -- ^ Left sequence + -> v a -- ^ Right sequence -> Trace a s -align AlignConfig{..} as bs = - let p = (lastIndex as, lastIndex bs) - in revTrace $ evalState (go p) M.empty +align AlignConfig{..} as bs = revTrace $ evalState (go p) M.empty where + p = (lastIndex as, lastIndex bs) revTrace (Trace s t) = Trace s (reverse t) lastIndex v = G.length v - 1 -- go p = do - res <- gets $ M.lookup p + res <- gets (M.lookup p) case res of Just r -> return r Nothing -> do diff --git a/src/Data/Align/Demo.hs b/src/Data/Align/Demo.hs index 1333f82..e410f8a 100644 --- a/src/Data/Align/Demo.hs +++ b/src/Data/Align/Demo.hs @@ -12,19 +12,24 @@ sampleGlobalConfig = alignConfig (-0.5) (-1) +testIn1, testIn2 :: String testIn1 = "dopple" testIn2 = "applied" +alignedGlobal :: Trace Char Double alignedGlobal = align sampleGlobalConfig (V.fromList testIn1) (V.fromList testIn2) +debug :: Trace Char s -> IO () debug = putStrLn . debugAlign . trace +printAlignedGlobal :: IO () printAlignedGlobal = debug alignedGlobal -- * Multi-sequence fun. -- | Example from https://www.biostat.wisc.edu/bmi576/lectures/multiple-alignment.pdf +nucs :: [String] nucs = [ "ATTGCCATT" , "ATGGCCATT" @@ -33,15 +38,20 @@ nucs = , "ATTGCCGATT" ] +alignNuc :: Eq a => [a] -> [a] -> Trace a Double alignNuc a b = align sampleGlobalConfig (V.fromList a) (V.fromList b) +alignedNucPairs :: [Trace Char Double] alignedNucPairs = do n <- tail nucs return $ alignNuc (head nucs) n +printAlignedNucPairs :: IO () printAlignedNucPairs = mapM_ (\x -> debug x >> putStrLn "") alignedNucPairs +alignedNucStar :: MultiTrace Integer Char Double alignedNucStar = centerStar sampleGlobalConfig $ zip [1..] (map V.fromList nucs) +printAlignedNucStar :: IO () printAlignedNucStar = putStrLn . debugMultiAlign . multiTrace $ alignedNucStar diff --git a/src/Data/Align/Text.hs b/src/Data/Align/Text.hs new file mode 100644 index 0000000..96dfc5f --- /dev/null +++ b/src/Data/Align/Text.hs @@ -0,0 +1,60 @@ +{-# language RecordWildCards #-} +{-# language OverloadedStrings #-} +module Data.Align.Text ( + -- * Text + align + ) where + +import qualified Data.List as L (maximumBy) +import Data.Ord (comparing) + +-- containers +import qualified Data.Map as M +-- text +import qualified Data.Text as T (Text, length, index) +-- transformers +import Control.Monad.Trans.State.Strict + + +import Data.Align.Types + + +-- | 'Data.Align.align', specialized to 'Data.Text.Text' +align :: (Num s, Ord s) => + AlignConfig Char s + -> T.Text -- ^ Left sequence + -> T.Text -- ^ Right sequence + -> Trace Char s +align AlignConfig{..} as bs = revTrace $ evalState (go p) M.empty + where + p = (lastIndex as, lastIndex bs) + revTrace (Trace s t) = Trace s (reverse t) + lastIndex v = T.length v - 1 + -- + go ij = do + res <- gets (M.lookup ij) + case res of + Just r -> return r + Nothing -> do + newRes <- pgo ij + modify (M.insert ij newRes) + return newRes + -- + pgo (i, j) + | i == (-1) || j == (-1) = return $ + if i == j then Trace 0 [] + else if i == (-1) + then skipInit j stepRight bs + else skipInit i stepLeft as + | otherwise = do + let a = as `T.index` i + b = bs `T.index` j + diag <- go (i-1,j-1) `tappend` Trace (acPairScore a b) [stepBoth a b] + a_gap <- go (i-1, j) `tappend` Trace ac_gap_penalty [stepLeft a] + b_gap <- go ( i,j-1) `tappend` Trace ac_gap_penalty [stepRight b] + return $ L.maximumBy (comparing traceScore) [diag, a_gap, b_gap] + -- + skipInit idx stepFun xs = + let score = ac_initial_gap_penalty * fromIntegral (idx+1) + tr = reverse [stepFun (xs `T.index` xi) | xi <- [0..idx]] + in Trace score tr diff --git a/src/Data/Align/Types.hs b/src/Data/Align/Types.hs new file mode 100644 index 0000000..f838a40 --- /dev/null +++ b/src/Data/Align/Types.hs @@ -0,0 +1,72 @@ +module Data.Align.Types where + + +data AlignConfig a s = AlignConfig + { acPairScore :: a -> a -> s + , ac_initial_gap_penalty :: s + , ac_gap_penalty :: s + } + +eqAlignConfig :: Eq a => + s -- ^ match score + -> s -- ^ mismatch score + -> s -- ^ initial gap penalty + -> s -- ^ gap penalty + -> AlignConfig a s +eqAlignConfig ms mms = + alignConfig (\a b -> if a == b then ms else mms) + +-- | Configures the scores used when aligning. +-- The gap scores should be negative in order to be penalties. +alignConfig :: (a -> a -> s) -- ^ Scoring function. + -> s -- ^ Initial gap score. + -> s -- ^ Gap score. + -> AlignConfig a s +alignConfig = AlignConfig + +-- | Configuration for local alignment. +localAlignConfig + :: Num s + => (a -> a -> s) -- ^ Scoring function. + -> s -- ^ Gap score. + -> AlignConfig a s +localAlignConfig f = alignConfig f 0 + +-- | Either an unmatched item or a match. +type Step a = Either (Either a a) (a, a) + +stepLeft :: a -> Either (Either a b1) b2 +stepLeft = Left . Left +stepRight :: b1 -> Either (Either a b1) b2 +stepRight = Left . Right +stepBoth :: a1 -> b -> Either a2 (a1, b) +stepBoth a b = Right (a,b) + +isMatch :: Step a -> Bool +isMatch (Right _) = True +isMatch _ = False + +isLeft :: Step a -> Bool +isLeft (Left (Left _)) = True +isLeft _ = False + +isRight :: Step a -> Bool +isRight (Left (Right _)) = True +isRight _ = False + +-- | The result of the alignment. +data Trace a s = Trace + { traceScore :: s + , trace :: [Step a] + } + +instance Semigroup s => Semigroup (Trace a s) where + (Trace s1 t1) <> (Trace s2 t2) = Trace (s1 <> s2) (t1 ++ t2) + +instance (Show a, Show s) => Show (Trace a s) where + show (Trace s t) = "Trace(score = " ++ show s ++ ", steps = " ++ show t ++ ")" + +tappend :: (Functor f, Num s) => + f (Trace a s) -> Trace a s -> f (Trace a s) +mt `tappend` (Trace z (t:_)) = + fmap (\(Trace s ts) -> Trace (s+z) (t:ts)) mt diff --git a/stack.yaml b/stack.yaml new file mode 100644 index 0000000..ab6fbce --- /dev/null +++ b/stack.yaml @@ -0,0 +1,30 @@ +resolver: lts-14.27 + +packages: +- . + +# extra-deps: [] + +# Override default flag values for local packages and extra-deps +# flags: {} + +# Extra package databases containing global packages +# extra-package-dbs: [] + +# Control whether we use the GHC we find on the path +# system-ghc: true +# +# Require a specific version of stack, using version ranges +# require-stack-version: -any # Default +# require-stack-version: ">=2.1" +# +# Override the architecture used by stack, especially useful on Windows +# arch: i386 +# arch: x86_64 +# +# Extra directories used by stack for building +# extra-include-dirs: [/path/to/dir] +# extra-lib-dirs: [/path/to/dir] +# +# Allow a newer minor version of GHC than the snapshot specifies +# compiler-check: newer-minor From 60ef1f1429c84371f8f0d83dfa4b87c8ec4c3bbe Mon Sep 17 00:00:00 2001 From: ocramz Date: Fri, 10 Apr 2020 22:10:37 +0200 Subject: [PATCH 2/2] apply review comments --- src/Data/Align/Text.hs | 3 --- src/Data/Align/Types.hs | 26 ++++++++++++++------------ stack.yaml | 26 -------------------------- 3 files changed, 14 insertions(+), 41 deletions(-) diff --git a/src/Data/Align/Text.hs b/src/Data/Align/Text.hs index 96dfc5f..30a8385 100644 --- a/src/Data/Align/Text.hs +++ b/src/Data/Align/Text.hs @@ -8,11 +8,8 @@ module Data.Align.Text ( import qualified Data.List as L (maximumBy) import Data.Ord (comparing) --- containers import qualified Data.Map as M --- text import qualified Data.Text as T (Text, length, index) --- transformers import Control.Monad.Trans.State.Strict diff --git a/src/Data/Align/Types.hs b/src/Data/Align/Types.hs index f838a40..df5726b 100644 --- a/src/Data/Align/Types.hs +++ b/src/Data/Align/Types.hs @@ -1,4 +1,13 @@ -module Data.Align.Types where +module Data.Align.Types ( + AlignConfig(..) + , alignConfig + , localAlignConfig + , Step + , stepLeft, stepRight, stepBoth + , isMatch, isLeft, isRight + , Trace(..) + , tappend + ) where data AlignConfig a s = AlignConfig @@ -7,20 +16,13 @@ data AlignConfig a s = AlignConfig , ac_gap_penalty :: s } -eqAlignConfig :: Eq a => - s -- ^ match score - -> s -- ^ mismatch score - -> s -- ^ initial gap penalty - -> s -- ^ gap penalty - -> AlignConfig a s -eqAlignConfig ms mms = - alignConfig (\a b -> if a == b then ms else mms) -- | Configures the scores used when aligning. +-- -- The gap scores should be negative in order to be penalties. -alignConfig :: (a -> a -> s) -- ^ Scoring function. - -> s -- ^ Initial gap score. - -> s -- ^ Gap score. +alignConfig :: (a -> a -> s) -- ^ Scoring function + -> s -- ^ Initial gap score + -> s -- ^ Gap score -> AlignConfig a s alignConfig = AlignConfig diff --git a/stack.yaml b/stack.yaml index ab6fbce..a55ce3d 100644 --- a/stack.yaml +++ b/stack.yaml @@ -2,29 +2,3 @@ resolver: lts-14.27 packages: - . - -# extra-deps: [] - -# Override default flag values for local packages and extra-deps -# flags: {} - -# Extra package databases containing global packages -# extra-package-dbs: [] - -# Control whether we use the GHC we find on the path -# system-ghc: true -# -# Require a specific version of stack, using version ranges -# require-stack-version: -any # Default -# require-stack-version: ">=2.1" -# -# Override the architecture used by stack, especially useful on Windows -# arch: i386 -# arch: x86_64 -# -# Extra directories used by stack for building -# extra-include-dirs: [/path/to/dir] -# extra-lib-dirs: [/path/to/dir] -# -# Allow a newer minor version of GHC than the snapshot specifies -# compiler-check: newer-minor