Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Custom JSON parsing #115

Merged
merged 22 commits into from
May 24, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ARCHITECTURE.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ The ICU message parser is a [recursive descent parser](https://en.wikipedia.org/

For example, given an ICU message `hello <bold>{name}</bold>`, we’d first try parsing for an interpolation or tag, and failing that would parse plaintext, which we’d do until we encountered a reason to stop, in this case the tag opening character. We’ve stored "hello " as plaintext and will now continue along, this time succeeding in parsing the tag. We’ll now recursively parse inside the bounds of the tag, reusing the same top-level parser we were just using, this time parsing an interpolation. Having done this we’ve _consumed_ the entire input string and have successfully parsed a recursive list of nodes making up our [AST](https://en.wikipedia.org/wiki/Abstract_syntax_tree).

JSON parsing is offloaded to [aeson](https://hackage.haskell.org/package/aeson).
JSON parsing is handled internally for better interop with the ICU parser. JSON encoding is offloaded to [aeson](https://hackage.haskell.org/package/aeson).

### Compilation

Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ Translation files should be encoded as JSON and might look something like this:
{
"welcome": {
"message": "Hello {name}",
"description": "Welcome message",
"backend": "ts"
}
}
Expand Down
12 changes: 6 additions & 6 deletions cli/Main.hs
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
module Main where

import CLI (Opts (..), getOpts)
import qualified Data.Text as T
import Intlc.Compiler (compileDataset, compileFlattened)
import CLI (Opts (..), getOpts)
import qualified Data.Text as T
import Intlc.Compiler (compileDataset, compileFlattened)
import Intlc.Core
import Intlc.Parser (ParseFailure, parseDataset, printErr)
import Intlc.Parser (parseDataset, printErr)
import Intlc.Parser.Error (ParseFailure)
import Prelude

main :: IO ()
Expand All @@ -16,5 +17,4 @@ main = getOpts >>= \case
compilerDie = die . T.unpack . ("Invalid keys:\n" <>) . T.intercalate "\n" . fmap ("\t" <>) . toList

getParsed :: FilePath -> IO (Either ParseFailure (Dataset Translation))
getParsed = fmap parseDataset . readFileLBS

getParsed x = parseDataset x <$> readFileText x
6 changes: 5 additions & 1 deletion intlc.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ library
Intlc.Core
Intlc.ICU
Intlc.Parser
Intlc.Parser.Error
Intlc.Parser.JSON
Intlc.Parser.ICU
Utils

test-suite test-intlc
Expand All @@ -78,4 +81,5 @@ test-suite test-intlc
Intlc.Backend.TypeScriptSpec
Intlc.CompilerSpec
Intlc.EndToEndSpec
Intlc.ParserSpec
Intlc.Parser.JSONSpec
Intlc.Parser.ICUSpec
18 changes: 1 addition & 17 deletions lib/Intlc/Core.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,8 @@

module Intlc.Core where

import Data.Aeson (FromJSON (..), ToJSON (toEncoding),
withObject, withText, (.!=), (.:), (.:?),
(.=))
import Data.Aeson (ToJSON (toEncoding), (.=))
import Data.Aeson.Encoding (pairs, string)
import qualified Data.Text as T
import Intlc.ICU (Message)
import Prelude

Expand All @@ -22,12 +19,6 @@ data Backend
| TypeScriptReact
deriving (Show, Eq, Generic)

instance FromJSON Backend where
parseJSON = withText "Backend" decode
where decode "ts" = pure TypeScript
decode "tsx" = pure TypeScriptReact
decode x = fail $ "Unknown backend: " <> T.unpack x

instance ToJSON Backend where
toEncoding TypeScript = string "ts"
toEncoding TypeScriptReact = string "tsx"
Expand All @@ -39,13 +30,6 @@ data UnparsedTranslation = UnparsedTranslation
}
deriving (Show, Eq, Generic)

instance FromJSON UnparsedTranslation where
parseJSON = withObject "UnparsedTranslation" decode
where decode x = UnparsedTranslation
<$> x .: "message"
<*> x .:? "backend" .!= TypeScript
<*> x .:? "description"

instance ToJSON UnparsedTranslation where
toEncoding (UnparsedTranslation msg be md) = pairs $
"message" .= msg
Expand Down
241 changes: 8 additions & 233 deletions lib/Intlc/Parser.hs
Original file line number Diff line number Diff line change
@@ -1,239 +1,14 @@
-- This module follows the following whitespace rules:
-- * Consume all whitespace after tokens where possible.
-- * Therefore, assume no whitespace before tokens.

{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
{-# HLINT ignore "Use newtype instead of data" #-}

module Intlc.Parser where

import qualified Control.Applicative.Combinators.NonEmpty as NE
import Data.Aeson (decode)
import Data.ByteString.Lazy (ByteString)
import qualified Data.Map as M
import qualified Data.Text as T
import Data.Validation (toEither,
validationNel)
import Data.Void ()
import Intlc.Core
import Intlc.ICU
import Prelude hiding (ByteString)
import Text.Megaparsec hiding (State, Stream,
Token, many, some,
token)
import Text.Megaparsec.Char
import qualified Text.Megaparsec.Char.Lexer as L
import Text.Megaparsec.Error.Builder

type ParseErr = ParseErrorBundle Text MessageParseErr

data ParseFailure
= FailedJsonParse
| FailedDatasetParse (NonEmpty ParseErr)
deriving (Show, Eq)
import Intlc.Parser.Error (ParseFailure)
import Intlc.Parser.JSON (ParserState (ParserState), dataset)
import Prelude
import Text.Megaparsec (runParser)
import Text.Megaparsec.Error

data MessageParseErr
= NoClosingCallbackTag Text
| BadClosingCallbackTag Text Text
deriving (Show, Eq, Ord)

instance ShowErrorComponent MessageParseErr where
showErrorComponent (NoClosingCallbackTag x) = "Callback tag <" <> T.unpack x <> "> not closed"
showErrorComponent (BadClosingCallbackTag x y) = "Callback tag <" <> T.unpack x <> "> not closed, instead found </" <> T.unpack y <> ">"

failingWith :: MonadParsec e s m => Int -> e -> m a
pos `failingWith` e = parseError . errFancy pos . fancy . ErrorCustom $ e
parseDataset :: FilePath -> Text -> Either ParseFailure (Dataset Translation)
parseDataset = runParser (evalStateT dataset (ParserState mempty))

printErr :: ParseFailure -> String
printErr FailedJsonParse = "Failed to parse JSON"
printErr (FailedDatasetParse es) = intercalate "\n" . toList . fmap errorBundlePretty $ es

parseDataset :: ByteString -> Either ParseFailure (Dataset Translation)
parseDataset = parse' <=< decode'
where decode' = maybeToRight FailedJsonParse . decode
parse' = toEither . first FailedDatasetParse . M.traverseWithKey ((validationNel .) . parseTranslationFor)

parseTranslationFor :: Text -> UnparsedTranslation -> Either ParseErr Translation
parseTranslationFor name (UnparsedTranslation umsg be md) = do
msg' <- runParser (runReaderT msg initialState) (T.unpack name) umsg
pure $ Translation msg' be md

data ParserState = ParserState
{ pluralCtxName :: Maybe Text
}

initialState :: ParserState
initialState = ParserState mempty

type Parser = ReaderT ParserState (Parsec MessageParseErr Text)

ident :: Parser Text
ident = T.pack <$> some letterChar

msg :: Parser Message
msg = f . mergePlaintext <$> manyTill token eof
where f [] = Static ""
f [Plaintext x] = Static x
f (x:xs) = Dynamic (x :| xs)

token :: Parser Token
token = choice
[ Interpolation <$> (interp <|> callback)
-- Plural cases support interpolating the number/argument in context with
-- `#`. When there's no such context, fail the parse in effect treating it
-- as plaintext.
, asks pluralCtxName >>= \case
Just n -> Interpolation (Arg n PluralRef) <$ string "#"
Nothing -> empty
, Plaintext <$> (try escaped <|> plaintext)
]

plaintext :: Parser Text
plaintext = T.singleton <$> L.charLiteral

escaped :: Parser Text
escaped = apos *> choice
-- Double escape two apostrophes as one: "''" -> "'"
[ "'" <$ apos
-- Escape everything until another apostrophe, being careful of internal
-- double escapes: "'{a''}'" -> "{a'}"
, try $ T.pack <$> someTillNotDouble L.charLiteral apos
-- Escape the next syntax character as plaintext: "'{" -> "{"
, T.singleton <$> syn
]
where apos = char '\''
syn = char '{' <|> char '<'
-- Like `someTill`, but doesn't end upon encountering two `end` tokens,
-- instead consuming them as one and continuing.
someTillNotDouble p end = tryOne
where tryOne = (:) <$> p <*> go
go = ((:) <$> try (end <* end) <*> go) <|> (mempty <$ end) <|> tryOne

callback :: Parser Arg
callback = do
oname <- string "<" *> ident <* string ">"
mrest <- observing ((,,) <$> children <* string "</" <*> getOffset <*> ident <* string ">")
case mrest of
Left _ -> 1 `failingWith` NoClosingCallbackTag oname
Right (ch, pos, cname) -> if oname == cname
then pure (Arg oname ch)
else pos `failingWith` BadClosingCallbackTag oname cname
where children = Callback . mergePlaintext <$> manyTill token (lookAhead $ string "</")

interp :: Parser Arg
interp = do
n <- string "{" *> ident
Arg n <$> choice
[ String <$ string "}"
, sep *> body n <* string "}"
]
where sep = string "," <* hspace1
body n = choice
[ uncurry Bool <$> (string "boolean" *> sep *> boolCases)
, Number <$ string "number"
, Date <$> (string "date" *> sep *> dateTimeFmt)
, Time <$> (string "time" *> sep *> dateTimeFmt)
, Plural <$> withPluralCtx n (
string "plural" *> sep *> cardinalPluralCases
<|> string "selectordinal" *> sep *> ordinalPluralCases
)
, uncurry Select <$> (string "select" *> sep *> selectCases)
]
withPluralCtx n = withReaderT (const . ParserState . pure $ n)

dateTimeFmt :: Parser DateTimeFmt
dateTimeFmt = choice
[ Short <$ string "short"
, Medium <$ string "medium"
, Long <$ string "long"
, Full <$ string "full"
]

caseBody :: Parser Stream
caseBody = mergePlaintext <$> (string "{" *> manyTill token (string "}"))

boolCases :: Parser (Stream, Stream)
boolCases = (,)
<$> (string "true" *> hspace1 *> caseBody)
<* hspace1
<*> (string "false" *> hspace1 *> caseBody)

selectCases :: Parser (NonEmpty SelectCase, Maybe SelectWildcard)
selectCases = (,) <$> cases <*> optional wildcard
where cases = NE.sepEndBy1 (SelectCase <$> (name <* hspace1) <*> caseBody) hspace1
wildcard = SelectWildcard <$> (string wildcardName *> hspace1 *> caseBody)
name = try $ mfilter (/= wildcardName) ident
wildcardName = "other"

cardinalPluralCases :: Parser Plural
cardinalPluralCases = fmap Cardinal . tryClassify =<< p
where tryClassify = maybe empty pure . uncurry classifyCardinal
p = (,) <$> disorderedPluralCases <*> optional pluralWildcard

ordinalPluralCases :: Parser Plural
ordinalPluralCases = fmap Ordinal . tryClassify =<< p
where tryClassify = maybe empty pure . uncurry classifyOrdinal
p = (,) <$> disorderedPluralCases <*> pluralWildcard

-- Need to lift parsed plural cases into this type to make the list homogeneous.
data ParsedPluralCase
= ParsedExact (PluralCase PluralExact)
| ParsedRule (PluralCase PluralRule)

disorderedPluralCases :: Parser (NonEmpty ParsedPluralCase)
disorderedPluralCases = flip NE.sepEndBy1 hspace1 $ choice
[ (ParsedExact .) . PluralCase <$> pluralExact <* hspace1 <*> caseBody
, (ParsedRule .) . PluralCase <$> pluralRule <* hspace1 <*> caseBody
]

pluralExact :: Parser PluralExact
pluralExact = PluralExact . T.pack <$> (string "=" *> some numberChar)

pluralRule :: Parser PluralRule
pluralRule = choice
[ Zero <$ string "zero"
, One <$ string "one"
, Two <$ string "two"
, Few <$ string "few"
, Many <$ string "many"
]

pluralWildcard :: Parser PluralWildcard
pluralWildcard = PluralWildcard <$> (string "other" *> hspace1 *> caseBody)

-- | To simplify parsing cases we validate after-the-fact here. This achieves
-- two purposes. Firstly it enables us to fail the parse if the cases are not
-- exclusively literals and there's no wildcard (see below), and secondly it
-- allows us to organise the cases into the appropriate `Plural` constructors,
-- which in turn enables more efficient codegen later on.
--
-- =0 {} =1 {} -- Lit
-- =0 {} =1 {} other {} -- Lit
-- one {} two {} other {} -- Rule
-- =0 {} one {} other {} -- Mixed
--
classifyCardinal :: Foldable f => f ParsedPluralCase -> Maybe PluralWildcard -> Maybe CardinalPlural
classifyCardinal xs mw =
case (organisePluralCases xs, mw) of
((Just ls, Nothing), mw') -> Just (LitPlural ls mw')
((Nothing, Just rs), Just w) -> Just (RulePlural rs w)
((Just ls, Just rs), Just w) -> Just (MixedPlural ls rs w)
-- Rule plurals require a wildcard.
((_, Just _), Nothing) -> Nothing
-- We should have parsed and organised at least one case somewhere.
((Nothing, Nothing), _) -> Nothing

-- | This is simpler than its cardinal counterpart. Here we need only validate
-- that there is at least one rule case. This is performed here to simplify
-- supporting disordered cases in the parser (whereas validating the presence
-- of a wildcard at the end is trivial in the parser).
classifyOrdinal :: Foldable f => f ParsedPluralCase -> PluralWildcard -> Maybe OrdinalPlural
classifyOrdinal xs w =
case organisePluralCases xs of
(_, Nothing) -> Nothing
(mls, Just rs) -> Just $ OrdinalPlural (foldMap toList mls) rs w

organisePluralCases :: Foldable f => f ParsedPluralCase -> (Maybe (NonEmpty (PluralCase PluralExact)), Maybe (NonEmpty (PluralCase PluralRule)))
organisePluralCases = bimap nonEmpty nonEmpty . foldr f mempty
where f (ParsedExact x) = first (x:)
f (ParsedRule x) = second (x:)
printErr = errorBundlePretty
44 changes: 44 additions & 0 deletions lib/Intlc/Parser/Error.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
-- Our parsers are unavoidably tied together, and it's easiest if dependent
-- parsers share the same error type, but we also need to avoid cyclic
-- dependencies - so it all lives here.

{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
{-# HLINT ignore "Use newtype instead of data" #-}

module Intlc.Parser.Error where

import qualified Data.Text as T
import Prelude
import Text.Megaparsec (MonadParsec (parseError))
import Text.Megaparsec.Error
import Text.Megaparsec.Error.Builder

type ParseFailure = ParseErrorBundle Text ParseErr

data ParseErr
= FailedJSONParse JSONParseErr
| FailedMsgParse MessageParseErr
deriving (Show, Eq, Ord)

data JSONParseErr
= DuplicateKey Text
deriving (Show, Eq, Ord)

data MessageParseErr
= NoClosingCallbackTag Text
| BadClosingCallbackTag Text Text
deriving (Show, Eq, Ord)

instance ShowErrorComponent ParseErr where
showErrorComponent (FailedJSONParse e) = showErrorComponent e
showErrorComponent (FailedMsgParse e) = showErrorComponent e

instance ShowErrorComponent JSONParseErr where
showErrorComponent (DuplicateKey k) = "Duplicate key: \"" <> T.unpack k <> "\""

instance ShowErrorComponent MessageParseErr where
showErrorComponent (NoClosingCallbackTag x) = "Callback tag <" <> T.unpack x <> "> not closed"
showErrorComponent (BadClosingCallbackTag x y) = "Callback tag <" <> T.unpack x <> "> not closed, instead found </" <> T.unpack y <> ">"

failingWith :: MonadParsec e s m => Int -> e -> m a
pos `failingWith` e = parseError . errFancy pos . fancy . ErrorCustom $ e
Loading