haskell · jappeace · Nov 5, 2023 · Nov 13, 2023 · Nov 18, 2023 · Nov 18, 2023
diff --git a/Cabal-syntax/Cabal-syntax.cabal b/Cabal-syntax/Cabal-syntax.cabal
@@ -197,6 +197,7 @@ library
     Distribution.Utils.String
     Distribution.Utils.Structured
     Distribution.Version
+    Distribution.PackageDescription.ExactPrint
     Language.Haskell.Extension
 
   other-extensions:

diff --git a/Cabal-syntax/src/Distribution/Fields/Field.hs b/Cabal-syntax/src/Distribution/Fields/Field.hs
@@ -2,6 +2,8 @@
 {-# LANGUAGE DeriveFoldable #-}
 {-# LANGUAGE DeriveFunctor #-}
 {-# LANGUAGE DeriveTraversable #-}
+{-# LANGUAGE LambdaCase #-}
+{-# LANGUAGE StrictData #-}
 
 -- | Cabal-like file AST types: 'Field', 'Section' etc
 --
@@ -12,11 +14,13 @@ module Distribution.Fields.Field
   , fieldName
   , fieldAnn
   , fieldUniverse
+  , fieldMeta
   , FieldLine (..)
   , fieldLineAnn
   , fieldLineBS
   , SectionArg (..)
   , sectionArgAnn
+  , sectionArgContent
 
     -- * Name
   , FieldName
@@ -28,6 +32,12 @@ module Distribution.Fields.Field
     -- * Conversions to String
   , sectionArgsToString
   , fieldLinesToString
+
+  -- * meta data
+  , MetaField(..)
+  , fieldMeta
+  , metaComment
+  , metaAnn
   ) where
 
 import Data.ByteString (ByteString)
@@ -47,30 +57,55 @@ import qualified Data.Foldable1 as F1
 
 -- | A Cabal-like file consists of a series of fields (@foo: bar@) and sections (@library ...@).
 data Field ann
-  = Field !(Name ann) [FieldLine ann]
-  | Section !(Name ann) [SectionArg ann] [Field ann]
+  = Field (Name ann) [FieldLine ann]
+  | Section (Name ann) [SectionArg ann] [Field ann]
+  | Meta (MetaField ann)
+  deriving (Eq, Show, Functor, Foldable, Traversable)
+
+data MetaField ann = MetaComment ann ByteString
+                   | MetaWhitespace ann ByteString
   deriving (Eq, Show, Functor, Foldable, Traversable)
 
+metaComment :: MetaField ann -> Maybe ByteString
+metaComment = \case
+  (MetaComment _ bs) -> Just bs
+  (MetaWhitespace _ _) -> Nothing
+
+metaAnn :: MetaField ann -> ann
+metaAnn = \case
+  (MetaComment ann _) -> ann
+  (MetaWhitespace ann _) ->  ann
+
 -- | Section of field name
-fieldName :: Field ann -> Name ann
-fieldName (Field n _) = n
-fieldName (Section n _ _) = n
+fieldName :: Field ann -> (Maybe (Name ann))
+fieldName (Field n _) = Just n
+fieldName (Section n _ _) = Just n
+fieldName (Meta _) = Nothing
+
+fieldMeta :: Field ann -> Maybe (MetaField ann)
+fieldMeta = \case
+  (Field n _) -> Nothing
+  (Section n _ _) -> Nothing
+  (Meta x) -> Just x
 
 fieldAnn :: Field ann -> ann
-fieldAnn = nameAnn . fieldName
+fieldAnn (Field n _) = nameAnn n
+fieldAnn (Section n _ _) = nameAnn n
+fieldAnn (Meta x) = metaAnn x
 
 -- | All transitive descendants of 'Field', including itself.
 --
 -- /Note:/ the resulting list is never empty.
 fieldUniverse :: Field ann -> [Field ann]
 fieldUniverse f@(Section _ _ fs) = f : concatMap fieldUniverse fs
 fieldUniverse f@(Field _ _) = [f]
+fieldUniverse (Meta _) = []
 
 -- | A line of text representing the value of a field from a Cabal file.
 -- A field may contain multiple lines.
 --
 -- /Invariant:/ 'ByteString' has no newlines.
-data FieldLine ann = FieldLine !ann !ByteString
+data FieldLine ann = FieldLine ann ByteString
   deriving (Eq, Show, Functor, Foldable, Traversable)
 
 -- | @since 3.0.0.0
@@ -84,11 +119,11 @@ fieldLineBS (FieldLine _ bs) = bs
 -- | Section arguments, e.g. name of the library
 data SectionArg ann
   = -- | identifier, or something which looks like number. Also many dot numbers, i.e. "7.6.3"
-    SecArgName !ann !ByteString
+    SecArgName ann ByteString
   | -- | quoted string
-    SecArgStr !ann !ByteString
+    SecArgStr ann ByteString
   | -- | everything else, mm. operators (e.g. in if-section conditionals)
-    SecArgOther !ann !ByteString
+    SecArgOther ann ByteString
   deriving (Eq, Show, Functor, Foldable, Traversable)
 
 -- | Extract annotation from 'SectionArg'.
@@ -97,6 +132,12 @@ sectionArgAnn (SecArgName ann _) = ann
 sectionArgAnn (SecArgStr ann _) = ann
 sectionArgAnn (SecArgOther ann _) = ann
 
+sectionArgContent :: SectionArg ann -> ByteString
+sectionArgContent = \case
+    SecArgName _ann bs ->  bs
+    SecArgStr _ann bs -> bs
+    SecArgOther _ann bs -> bs
+
 -------------------------------------------------------------------------------
 -- Name
 -------------------------------------------------------------------------------
@@ -106,7 +147,7 @@ type FieldName = ByteString
 -- | A field name.
 --
 -- /Invariant/: 'ByteString' is lower-case ASCII.
-data Name ann = Name !ann !FieldName
+data Name ann = Name ann FieldName
   deriving (Eq, Show, Functor, Foldable, Traversable)
 
 mkName :: ann -> FieldName -> Name ann
@@ -158,6 +199,7 @@ instance F1.Foldable1 Field where
     F1.fold1 (F1.foldMap1 f x :| map (F1.foldMap1 f) ys)
   foldMap1 f (Section x ys zs) =
     F1.fold1 (F1.foldMap1 f x :| map (F1.foldMap1 f) ys ++ map (F1.foldMap1 f) zs)
+  foldMap1 f (Meta x) = f $ metaAnn x
 
 -- | @since 3.12.0.0
 instance F1.Foldable1 FieldLine where

diff --git a/Cabal-syntax/src/Distribution/Fields/Lexer.x b/Cabal-syntax/src/Distribution/Fields/Lexer.x
@@ -15,7 +15,7 @@
 #endif
 {-# OPTIONS_GHC -fno-warn-unused-imports #-}
 module Distribution.Fields.Lexer
-  (ltest, lexToken, Token(..), LToken(..)
+  (ltest, lexString, lexByteString, lexToken, Token(..), LToken(..)
   ,bol_section, in_section, in_field_layout, in_field_braces
   ,mkLexState) where
 
@@ -82,85 +82,102 @@ tokens :-
 }
 
 <bol_section, bol_field_layout, bol_field_braces> {
-  @nbspspacetab* @nl         { \pos len inp -> checkWhitespace pos len inp >> adjustPos retPos >> lexToken }
-  -- no @nl here to allow for comments on last line of the file with no trailing \n
-  $spacetab* "--" $comment*  ;  -- TODO: check the lack of @nl works here
-                                -- including counting line numbers
+  @nbspspacetab* @nl         { \pos len inp -> do
+                                  _ <- checkWhitespace pos len inp
+                                  adjustPos retPos
+                                  toki Whitespace pos len inp }
+  -- FIXME: no @nl here to allow for comments on last line of the file with no trailing \n
+  -- FIXME: TODO: check the lack of @nl works here including counting line numbers
+  $spacetab* "--" $comment*  { toki Comment }
 }
 
 <bol_section> {
-  @nbspspacetab*   { \pos len inp -> checkLeadingWhitespace pos len inp >>= \len' ->
-                                     -- len' is character whitespace length (counting nbsp as one)
-                                     if B.length inp == len
-                                       then return (L pos EOF)
-                                       else do
-                                        -- Small hack: if char and byte length mismatch
-                                        -- subtract the difference, so lexToken will count position correctly.
-                                        -- Proper (and slower) fix is to count utf8 length in lexToken
-                                        when (len' /= len) $ adjustPos (incPos (len' - len))
-                                        setStartCode in_section
-                                        return (L pos (Indent len')) }
+  @nbspspacetab*   { \pos len inp -> do
+                        len' <- checkLeadingWhitespace pos len inp
+                        -- len' is character whitespace length (counting nbsp as one)
+                        if B.length inp == len
+                          then return (L pos EOF)
+                          else do
+                          -- Small hack: if char and byte length mismatch
+                          -- subtract the difference, so lexToken will count position correctly.
+                          -- Proper (and slower) fix is to count utf8 length in lexToken
+                          when (len' /= len) $ adjustPos (incPos (len' - len))
+                          setStartCode in_section
+                          return (L pos (Indent len')) }
   $spacetab* \{    { tok  OpenBrace }
   $spacetab* \}    { tok  CloseBrace }
 }
 
 <in_section> {
-  $spacetab+   ; --TODO: don't allow tab as leading space
-
-  "--" $comment* ;
-
-  @name        { toki TokSym }
-  @string      { \pos len inp -> return $! L pos (TokStr (B.take (len - 2) (B.tail inp))) }
-  @oplike      { toki TokOther }
-  $paren       { toki TokOther }
-  \:           { tok  Colon }
-  \{           { tok  OpenBrace }
-  \}           { tok  CloseBrace }
-  @nl          { \_ _ _ -> adjustPos retPos >> setStartCode bol_section >> lexToken }
+  --TODO: don't allow tab as leading space
+  $spacetab+     { toki Whitespace }
+
+  "--" $comment* { toki Comment }
+
+  @name          { toki TokSym }
+  @string        { \pos len inp -> return $! L pos (TokStr (B.take (len - 2) (B.tail inp))) }
+  @oplike        { toki TokOther }
+  $paren         { toki TokOther }
+  \:             { tok  Colon }
+  \{             { tok  OpenBrace }
+  \}             { tok  CloseBrace }
+  @nl            { \pos len inp -> do
+                      adjustPos retPos
+                      setStartCode bol_section
+                      lexToken }
 }
 
 <bol_field_layout> {
-  @nbspspacetab* { \pos len inp -> checkLeadingWhitespace pos len inp >>= \len' ->
-                                  if B.length inp == len
-                                    then return (L pos EOF)
-                                    else do
-                                      -- Small hack: if char and byte length mismatch
-                                      -- subtract the difference, so lexToken will count position correctly.
-                                      -- Proper (and slower) fix is to count utf8 length in lexToken
-                                      when (len' /= len) $ adjustPos (incPos (len' - len))
-                                      setStartCode in_field_layout
-                                      return (L pos (Indent len')) }
+  @nbspspacetab* { \pos len inp -> do
+                      len' <- checkLeadingWhitespace pos len inp
+                      if B.length inp == len
+                        then return (L pos EOF)
+                        else do
+                          -- Small hack: if char and byte length mismatch
+                          -- subtract the difference, so lexToken will count position correctly.
+                          -- Proper (and slower) fix is to count utf8 length in lexToken
+                          when (len' /= len) $ adjustPos (incPos (len' - len))
+                          setStartCode in_field_layout
+                          return (L pos (Indent len')) }
 }
 
 <in_field_layout> {
-  $spacetab+;
-  $field_layout' $field_layout*  { toki TokFieldLine }
-  @nl             { \_ _ _ -> adjustPos retPos >> setStartCode bol_field_layout >> lexToken }
+  $spacetab+                       { toki Whitespace }
+  $field_layout' $field_layout*    { toki TokFieldLine }
+  @nl                              { \pos len inp -> do
+                                        adjustPos retPos
+                                        setStartCode bol_field_layout
+                                        lexToken }
 }
 
 <bol_field_braces> {
-   ()                { \_ _ _ -> setStartCode in_field_braces >> lexToken }
+   ()                              { \_ _ _ -> setStartCode in_field_braces >> lexToken }
 }
 
 <in_field_braces> {
-  $spacetab+;
+  $spacetab+                       { toki Whitespace }
   $field_braces' $field_braces*    { toki TokFieldLine }
-  \{                { tok  OpenBrace  }
-  \}                { tok  CloseBrace }
-  @nl               { \_ _ _ -> adjustPos retPos >> setStartCode bol_field_braces >> lexToken }
+  \{                               { tok  OpenBrace  }
+  \}                               { tok  CloseBrace }
+  @nl                              { \pos len inp -> do
+                                        adjustPos retPos
+                                        setStartCode bol_field_braces
+                                        lexToken }
 }
 
 {
 
 -- | Tokens of outer cabal file structure. Field values are treated opaquely.
-data Token = TokSym   !ByteString       -- ^ Haskell-like identifier, number or operator
-           | TokStr   !ByteString       -- ^ String in quotes
-           | TokOther !ByteString       -- ^ Operators and parens
-           | Indent   !Int              -- ^ Indentation token
+data Token = TokSym       !ByteString   -- ^ Haskell-like identifier, number or operator
+           | TokStr       !ByteString   -- ^ String in quotes
+           | TokOther     !ByteString   -- ^ Operators and parens
+           | Indent       !Int          -- ^ Indentation token
            | TokFieldLine !ByteString   -- ^ Lines after @:@
            | Colon
            | OpenBrace
            | CloseBrace
+           | Whitespace   !ByteString
+           | Comment      !ByteString
            | EOF
            | LexicalError InputStream --TODO: add separate string lexical error
   deriving Show
@@ -230,7 +247,6 @@ lexToken = do
         setInput inp'
         let !len_bytes = B.length inp - B.length inp'
         t <- action pos len_bytes inp
-        --traceShow t $ return tok
         return t
 
 
@@ -259,11 +275,29 @@ lexAll = do
     _       -> do ts <- lexAll
                   return (t : ts)
 
+-- FIXME: for debugging
+lexAll' :: Lex [(Int, LToken)]
+lexAll' = do
+  t <- lexToken
+  c <- getStartCode
+  case t of
+    L _ EOF -> return [(c, t)]
+    _       -> do ts <- lexAll'
+                  return ((c, t) : ts)
+
 ltest :: Int -> String -> Prelude.IO ()
 ltest code s =
   let (ws, xs) = execLexer (setStartCode code >> lexAll) (B.Char8.pack s)
    in traverse_ print ws >> traverse_ print xs
 
+lexString :: String -> ([LexWarning], [LToken])
+lexString = execLexer lexAll . B.Char8.pack
+
+lexByteString :: ByteString -> ([LexWarning], [LToken])
+lexByteString = execLexer lexAll
+
+lexByteString' :: ByteString -> ([LexWarning], [(Int, LToken)])
+lexByteString' = execLexer lexAll'
 
 mkLexState :: ByteString -> LexState
 mkLexState input = LexState