Skip to content

Commit

Permalink
Merge pull request #115 from unsplash/inhouse-json
Browse files Browse the repository at this point in the history
Custom JSON parsing
  • Loading branch information
samhh authored May 24, 2022
2 parents d118833 + 71dd8c2 commit 866e9b1
Show file tree
Hide file tree
Showing 13 changed files with 487 additions and 280 deletions.
2 changes: 1 addition & 1 deletion ARCHITECTURE.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ The ICU message parser is a [recursive descent parser](https://en.wikipedia.org/

For example, given an ICU message `hello <bold>{name}</bold>`, we’d first try parsing for an interpolation or tag, and failing that would parse plaintext, which we’d do until we encountered a reason to stop, in this case the tag opening character. We’ve stored "hello " as plaintext and will now continue along, this time succeeding in parsing the tag. We’ll now recursively parse inside the bounds of the tag, reusing the same top-level parser we were just using, this time parsing an interpolation. Having done this we’ve _consumed_ the entire input string and have successfully parsed a recursive list of nodes making up our [AST](https://en.wikipedia.org/wiki/Abstract_syntax_tree).

JSON parsing is offloaded to [aeson](https://hackage.haskell.org/package/aeson).
JSON parsing is handled internally for better interop with the ICU parser. JSON encoding is offloaded to [aeson](https://hackage.haskell.org/package/aeson).

### Compilation

Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ Translation files should be encoded as JSON and might look something like this:
{
"welcome": {
"message": "Hello {name}",
"description": "Welcome message",
"backend": "ts"
}
}
Expand Down
12 changes: 6 additions & 6 deletions cli/Main.hs
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
module Main where

import CLI (Opts (..), getOpts)
import qualified Data.Text as T
import Intlc.Compiler (compileDataset, compileFlattened)
import CLI (Opts (..), getOpts)
import qualified Data.Text as T
import Intlc.Compiler (compileDataset, compileFlattened)
import Intlc.Core
import Intlc.Parser (ParseFailure, parseDataset, printErr)
import Intlc.Parser (parseDataset, printErr)
import Intlc.Parser.Error (ParseFailure)
import Prelude

main :: IO ()
Expand All @@ -16,5 +17,4 @@ main = getOpts >>= \case
compilerDie = die . T.unpack . ("Invalid keys:\n" <>) . T.intercalate "\n" . fmap ("\t" <>) . toList

getParsed :: FilePath -> IO (Either ParseFailure (Dataset Translation))
getParsed = fmap parseDataset . readFileLBS

getParsed x = parseDataset x <$> readFileText x
6 changes: 5 additions & 1 deletion intlc.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ library
Intlc.Core
Intlc.ICU
Intlc.Parser
Intlc.Parser.Error
Intlc.Parser.JSON
Intlc.Parser.ICU
Utils

test-suite test-intlc
Expand All @@ -78,4 +81,5 @@ test-suite test-intlc
Intlc.Backend.TypeScriptSpec
Intlc.CompilerSpec
Intlc.EndToEndSpec
Intlc.ParserSpec
Intlc.Parser.JSONSpec
Intlc.Parser.ICUSpec
18 changes: 1 addition & 17 deletions lib/Intlc/Core.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,8 @@

module Intlc.Core where

import Data.Aeson (FromJSON (..), ToJSON (toEncoding),
withObject, withText, (.!=), (.:), (.:?),
(.=))
import Data.Aeson (ToJSON (toEncoding), (.=))
import Data.Aeson.Encoding (pairs, string)
import qualified Data.Text as T
import Intlc.ICU (Message)
import Prelude

Expand All @@ -22,12 +19,6 @@ data Backend
| TypeScriptReact
deriving (Show, Eq, Generic)

instance FromJSON Backend where
parseJSON = withText "Backend" decode
where decode "ts" = pure TypeScript
decode "tsx" = pure TypeScriptReact
decode x = fail $ "Unknown backend: " <> T.unpack x

instance ToJSON Backend where
toEncoding TypeScript = string "ts"
toEncoding TypeScriptReact = string "tsx"
Expand All @@ -39,13 +30,6 @@ data UnparsedTranslation = UnparsedTranslation
}
deriving (Show, Eq, Generic)

instance FromJSON UnparsedTranslation where
parseJSON = withObject "UnparsedTranslation" decode
where decode x = UnparsedTranslation
<$> x .: "message"
<*> x .:? "backend" .!= TypeScript
<*> x .:? "description"

instance ToJSON UnparsedTranslation where
toEncoding (UnparsedTranslation msg be md) = pairs $
"message" .= msg
Expand Down
241 changes: 8 additions & 233 deletions lib/Intlc/Parser.hs
Original file line number Diff line number Diff line change
@@ -1,239 +1,14 @@
-- This module follows the following whitespace rules:
-- * Consume all whitespace after tokens where possible.
-- * Therefore, assume no whitespace before tokens.

{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
{-# HLINT ignore "Use newtype instead of data" #-}

module Intlc.Parser where

import qualified Control.Applicative.Combinators.NonEmpty as NE
import Data.Aeson (decode)
import Data.ByteString.Lazy (ByteString)
import qualified Data.Map as M
import qualified Data.Text as T
import Data.Validation (toEither,
validationNel)
import Data.Void ()
import Intlc.Core
import Intlc.ICU
import Prelude hiding (ByteString)
import Text.Megaparsec hiding (State, Stream,
Token, many, some,
token)
import Text.Megaparsec.Char
import qualified Text.Megaparsec.Char.Lexer as L
import Text.Megaparsec.Error.Builder

type ParseErr = ParseErrorBundle Text MessageParseErr

data ParseFailure
= FailedJsonParse
| FailedDatasetParse (NonEmpty ParseErr)
deriving (Show, Eq)
import Intlc.Parser.Error (ParseFailure)
import Intlc.Parser.JSON (ParserState (ParserState), dataset)
import Prelude
import Text.Megaparsec (runParser)
import Text.Megaparsec.Error

data MessageParseErr
= NoClosingCallbackTag Text
| BadClosingCallbackTag Text Text
deriving (Show, Eq, Ord)

instance ShowErrorComponent MessageParseErr where
showErrorComponent (NoClosingCallbackTag x) = "Callback tag <" <> T.unpack x <> "> not closed"
showErrorComponent (BadClosingCallbackTag x y) = "Callback tag <" <> T.unpack x <> "> not closed, instead found </" <> T.unpack y <> ">"

failingWith :: MonadParsec e s m => Int -> e -> m a
pos `failingWith` e = parseError . errFancy pos . fancy . ErrorCustom $ e
parseDataset :: FilePath -> Text -> Either ParseFailure (Dataset Translation)
parseDataset = runParser (evalStateT dataset (ParserState mempty))

printErr :: ParseFailure -> String
printErr FailedJsonParse = "Failed to parse JSON"
printErr (FailedDatasetParse es) = intercalate "\n" . toList . fmap errorBundlePretty $ es

parseDataset :: ByteString -> Either ParseFailure (Dataset Translation)
parseDataset = parse' <=< decode'
where decode' = maybeToRight FailedJsonParse . decode
parse' = toEither . first FailedDatasetParse . M.traverseWithKey ((validationNel .) . parseTranslationFor)

parseTranslationFor :: Text -> UnparsedTranslation -> Either ParseErr Translation
parseTranslationFor name (UnparsedTranslation umsg be md) = do
msg' <- runParser (runReaderT msg initialState) (T.unpack name) umsg
pure $ Translation msg' be md

data ParserState = ParserState
{ pluralCtxName :: Maybe Text
}

initialState :: ParserState
initialState = ParserState mempty

type Parser = ReaderT ParserState (Parsec MessageParseErr Text)

ident :: Parser Text
ident = T.pack <$> some letterChar

msg :: Parser Message
msg = f . mergePlaintext <$> manyTill token eof
where f [] = Static ""
f [Plaintext x] = Static x
f (x:xs) = Dynamic (x :| xs)

token :: Parser Token
token = choice
[ Interpolation <$> (interp <|> callback)
-- Plural cases support interpolating the number/argument in context with
-- `#`. When there's no such context, fail the parse in effect treating it
-- as plaintext.
, asks pluralCtxName >>= \case
Just n -> Interpolation (Arg n PluralRef) <$ string "#"
Nothing -> empty
, Plaintext <$> (try escaped <|> plaintext)
]

plaintext :: Parser Text
plaintext = T.singleton <$> L.charLiteral

escaped :: Parser Text
escaped = apos *> choice
-- Double escape two apostrophes as one: "''" -> "'"
[ "'" <$ apos
-- Escape everything until another apostrophe, being careful of internal
-- double escapes: "'{a''}'" -> "{a'}"
, try $ T.pack <$> someTillNotDouble L.charLiteral apos
-- Escape the next syntax character as plaintext: "'{" -> "{"
, T.singleton <$> syn
]
where apos = char '\''
syn = char '{' <|> char '<'
-- Like `someTill`, but doesn't end upon encountering two `end` tokens,
-- instead consuming them as one and continuing.
someTillNotDouble p end = tryOne
where tryOne = (:) <$> p <*> go
go = ((:) <$> try (end <* end) <*> go) <|> (mempty <$ end) <|> tryOne

callback :: Parser Arg
callback = do
oname <- string "<" *> ident <* string ">"
mrest <- observing ((,,) <$> children <* string "</" <*> getOffset <*> ident <* string ">")
case mrest of
Left _ -> 1 `failingWith` NoClosingCallbackTag oname
Right (ch, pos, cname) -> if oname == cname
then pure (Arg oname ch)
else pos `failingWith` BadClosingCallbackTag oname cname
where children = Callback . mergePlaintext <$> manyTill token (lookAhead $ string "</")

interp :: Parser Arg
interp = do
n <- string "{" *> ident
Arg n <$> choice
[ String <$ string "}"
, sep *> body n <* string "}"
]
where sep = string "," <* hspace1
body n = choice
[ uncurry Bool <$> (string "boolean" *> sep *> boolCases)
, Number <$ string "number"
, Date <$> (string "date" *> sep *> dateTimeFmt)
, Time <$> (string "time" *> sep *> dateTimeFmt)
, Plural <$> withPluralCtx n (
string "plural" *> sep *> cardinalPluralCases
<|> string "selectordinal" *> sep *> ordinalPluralCases
)
, uncurry Select <$> (string "select" *> sep *> selectCases)
]
withPluralCtx n = withReaderT (const . ParserState . pure $ n)

dateTimeFmt :: Parser DateTimeFmt
dateTimeFmt = choice
[ Short <$ string "short"
, Medium <$ string "medium"
, Long <$ string "long"
, Full <$ string "full"
]

caseBody :: Parser Stream
caseBody = mergePlaintext <$> (string "{" *> manyTill token (string "}"))

boolCases :: Parser (Stream, Stream)
boolCases = (,)
<$> (string "true" *> hspace1 *> caseBody)
<* hspace1
<*> (string "false" *> hspace1 *> caseBody)

selectCases :: Parser (NonEmpty SelectCase, Maybe SelectWildcard)
selectCases = (,) <$> cases <*> optional wildcard
where cases = NE.sepEndBy1 (SelectCase <$> (name <* hspace1) <*> caseBody) hspace1
wildcard = SelectWildcard <$> (string wildcardName *> hspace1 *> caseBody)
name = try $ mfilter (/= wildcardName) ident
wildcardName = "other"

cardinalPluralCases :: Parser Plural
cardinalPluralCases = fmap Cardinal . tryClassify =<< p
where tryClassify = maybe empty pure . uncurry classifyCardinal
p = (,) <$> disorderedPluralCases <*> optional pluralWildcard

ordinalPluralCases :: Parser Plural
ordinalPluralCases = fmap Ordinal . tryClassify =<< p
where tryClassify = maybe empty pure . uncurry classifyOrdinal
p = (,) <$> disorderedPluralCases <*> pluralWildcard

-- Need to lift parsed plural cases into this type to make the list homogeneous.
data ParsedPluralCase
= ParsedExact (PluralCase PluralExact)
| ParsedRule (PluralCase PluralRule)

disorderedPluralCases :: Parser (NonEmpty ParsedPluralCase)
disorderedPluralCases = flip NE.sepEndBy1 hspace1 $ choice
[ (ParsedExact .) . PluralCase <$> pluralExact <* hspace1 <*> caseBody
, (ParsedRule .) . PluralCase <$> pluralRule <* hspace1 <*> caseBody
]

pluralExact :: Parser PluralExact
pluralExact = PluralExact . T.pack <$> (string "=" *> some numberChar)

pluralRule :: Parser PluralRule
pluralRule = choice
[ Zero <$ string "zero"
, One <$ string "one"
, Two <$ string "two"
, Few <$ string "few"
, Many <$ string "many"
]

pluralWildcard :: Parser PluralWildcard
pluralWildcard = PluralWildcard <$> (string "other" *> hspace1 *> caseBody)

-- | To simplify parsing cases we validate after-the-fact here. This achieves
-- two purposes. Firstly it enables us to fail the parse if the cases are not
-- exclusively literals and there's no wildcard (see below), and secondly it
-- allows us to organise the cases into the appropriate `Plural` constructors,
-- which in turn enables more efficient codegen later on.
--
-- =0 {} =1 {} -- Lit
-- =0 {} =1 {} other {} -- Lit
-- one {} two {} other {} -- Rule
-- =0 {} one {} other {} -- Mixed
--
classifyCardinal :: Foldable f => f ParsedPluralCase -> Maybe PluralWildcard -> Maybe CardinalPlural
classifyCardinal xs mw =
case (organisePluralCases xs, mw) of
((Just ls, Nothing), mw') -> Just (LitPlural ls mw')
((Nothing, Just rs), Just w) -> Just (RulePlural rs w)
((Just ls, Just rs), Just w) -> Just (MixedPlural ls rs w)
-- Rule plurals require a wildcard.
((_, Just _), Nothing) -> Nothing
-- We should have parsed and organised at least one case somewhere.
((Nothing, Nothing), _) -> Nothing

-- | This is simpler than its cardinal counterpart. Here we need only validate
-- that there is at least one rule case. This is performed here to simplify
-- supporting disordered cases in the parser (whereas validating the presence
-- of a wildcard at the end is trivial in the parser).
classifyOrdinal :: Foldable f => f ParsedPluralCase -> PluralWildcard -> Maybe OrdinalPlural
classifyOrdinal xs w =
case organisePluralCases xs of
(_, Nothing) -> Nothing
(mls, Just rs) -> Just $ OrdinalPlural (foldMap toList mls) rs w

organisePluralCases :: Foldable f => f ParsedPluralCase -> (Maybe (NonEmpty (PluralCase PluralExact)), Maybe (NonEmpty (PluralCase PluralRule)))
organisePluralCases = bimap nonEmpty nonEmpty . foldr f mempty
where f (ParsedExact x) = first (x:)
f (ParsedRule x) = second (x:)
printErr = errorBundlePretty
44 changes: 44 additions & 0 deletions lib/Intlc/Parser/Error.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
-- Our parsers are unavoidably tied together, and it's easiest if dependent
-- parsers share the same error type, but we also need to avoid cyclic
-- dependencies - so it all lives here.

{-# OPTIONS_GHC -Wno-unrecognised-pragmas #-}
{-# HLINT ignore "Use newtype instead of data" #-}

module Intlc.Parser.Error where

import qualified Data.Text as T
import Prelude
import Text.Megaparsec (MonadParsec (parseError))
import Text.Megaparsec.Error
import Text.Megaparsec.Error.Builder

type ParseFailure = ParseErrorBundle Text ParseErr

data ParseErr
= FailedJSONParse JSONParseErr
| FailedMsgParse MessageParseErr
deriving (Show, Eq, Ord)

data JSONParseErr
= DuplicateKey Text
deriving (Show, Eq, Ord)

data MessageParseErr
= NoClosingCallbackTag Text
| BadClosingCallbackTag Text Text
deriving (Show, Eq, Ord)

instance ShowErrorComponent ParseErr where
showErrorComponent (FailedJSONParse e) = showErrorComponent e
showErrorComponent (FailedMsgParse e) = showErrorComponent e

instance ShowErrorComponent JSONParseErr where
showErrorComponent (DuplicateKey k) = "Duplicate key: \"" <> T.unpack k <> "\""

instance ShowErrorComponent MessageParseErr where
showErrorComponent (NoClosingCallbackTag x) = "Callback tag <" <> T.unpack x <> "> not closed"
showErrorComponent (BadClosingCallbackTag x y) = "Callback tag <" <> T.unpack x <> "> not closed, instead found </" <> T.unpack y <> ">"

failingWith :: MonadParsec e s m => Int -> e -> m a
pos `failingWith` e = parseError . errFancy pos . fancy . ErrorCustom $ e
Loading

0 comments on commit 866e9b1

Please sign in to comment.