From b039ef9c2b97bf416c0837b062ea2d7c935ef6c5 Mon Sep 17 00:00:00 2001 From: Kangjing Huang Date: Sun, 10 Dec 2023 02:11:04 -0500 Subject: [PATCH 1/9] Remove copyright notices as requested --- source/src/BNFC/Backend/TreeSitter.hs | 2 -- source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs | 2 -- source/src/BNFC/Backend/TreeSitter/RegToJSReg.hs | 2 -- 3 files changed, 6 deletions(-) diff --git a/source/src/BNFC/Backend/TreeSitter.hs b/source/src/BNFC/Backend/TreeSitter.hs index 885124c3..2849c39f 100644 --- a/source/src/BNFC/Backend/TreeSitter.hs +++ b/source/src/BNFC/Backend/TreeSitter.hs @@ -1,7 +1,5 @@ {- BNF Converter: TreeSitter Grammar Generator - Copyright (C) 2004 Author: Markus Forsberg, Michael Pellauer, - Bjorn Bringert Description : This module generates the grammar.js input file for tree-sitter. diff --git a/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs b/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs index eae9b17b..05279324 100644 --- a/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs +++ b/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs @@ -1,7 +1,5 @@ {- BNF Converter: TreeSitter Grammar Generator - Copyright (C) 2004 Author: Markus Forsberg, Michael Pellauer, - Bjorn Bringert Description : This module converts BNFC grammar to the contents of a tree-sitter grammar.js file diff --git a/source/src/BNFC/Backend/TreeSitter/RegToJSReg.hs b/source/src/BNFC/Backend/TreeSitter/RegToJSReg.hs index 92ca550d..0f233c50 100644 --- a/source/src/BNFC/Backend/TreeSitter/RegToJSReg.hs +++ b/source/src/BNFC/Backend/TreeSitter/RegToJSReg.hs @@ -1,7 +1,5 @@ {- BNF Converter: TreeSitter Grammar Generator - Copyright (C) 2004 Author: Markus Forsberg, Michael Pellauer, - Bjorn Bringert Description : This module converts BNFC Reg to Javascript regular expressions that is used in From e46218c34f45b956ad2d2a5b915a7e7750f028ae Mon Sep 17 00:00:00 2001 From: Kangjing Huang Date: Sun, 10 Dec 2023 02:44:33 -0500 Subject: [PATCH 2/9] Testing for tree-sitter backend --- testing/src/ParameterizedTests.hs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/testing/src/ParameterizedTests.hs b/testing/src/ParameterizedTests.hs index 16215fee..446f1e93 100644 --- a/testing/src/ParameterizedTests.hs +++ b/testing/src/ParameterizedTests.hs @@ -419,6 +419,7 @@ parameters = concat , javaParams { tpName = "Java (with jflex and line numbers)" , tpBnfcOptions = ["--java", "--jflex", "-l"] } ] + , [ treeSitter ] ] where base = baseParameters @@ -442,6 +443,14 @@ parameters = concat , tpBnfcOptions = ["--ocaml"] , tpRunTestProg = haskellRunTestProg } + treeSitter = TP + { tpName = "tree-sitter" + , tpBuild = do + cmd "tree-sitter" "generate" . (:[]) =<< findFile "grammar.js" + , tpBnfcOptions = ["--tree-sitter"] + , tpRunTestProg = \ _lang args -> do + cmd "tree-sitter" "parse" args + } -- | Helper function that runs bnfc with the context's options and an -- option to generate 'tpMakefile'. From 98b8d03334384167665e6ea6984c61793a4b08b2 Mon Sep 17 00:00:00 2001 From: Kangjing Huang Date: Tue, 9 Jan 2024 20:29:34 -0500 Subject: [PATCH 3/9] Fix escape bug; Use optional in tree-sitter --- .../BNFC/Backend/TreeSitter/CFtoTreeSitter.hs | 79 ++++++++++++------- .../src/BNFC/Backend/TreeSitter/RegToJSReg.hs | 2 +- 2 files changed, 51 insertions(+), 30 deletions(-) diff --git a/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs b/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs index 05279324..aaf0aea1 100644 --- a/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs +++ b/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs @@ -17,6 +17,7 @@ import BNFC.CF import BNFC.Lexing (mkRegMultilineComment) import BNFC.PrettyPrint import Prelude hiding ((<>)) +import Data.Maybe (catMaybes, isNothing, fromMaybe) -- | Indent one level of 2 spaces indent :: Doc -> Doc @@ -78,23 +79,13 @@ prExtras cf = -- into this list. This will require integration of a regex engine. prWord :: CF -> Doc prWord cf = - if wordNeeded - then - defineSymbol "word" - $+$ indent - ( wrapChoice - ( usrTokensFormatted - ++ [text "$.token_Ident" | identUsed] - ) - ) - <> "," - else empty + maybe empty (\word_list -> defineSymbol "word" $+$ indent word_list <> ",") $ + wrapChoiceStrict $ map Just $ usrTokensFormatted ++ [text "$.token_Ident" | identUsed] where - wordNeeded = identUsed || usrTokens /= [] identUsed = isUsedCat cf (TokenCat catIdent) usrTokens = tokenPragmas cf usrTokensFormatted = - map (text . refName . formatCatName False . TokenCat . fst) $ usrTokens + map (text . refName . formatCatName False . TokenCat . fst) usrTokens -- | Print builtin token rules according to their usage prBuiltinTokenRules :: CF -> Doc @@ -152,6 +143,16 @@ prUsrTokenRules cf = vcat' $ map prOneToken tokens hasInternal :: [Rule] -> Bool hasInternal = not . all isParsable +-- Tree Sitter does not support empty strings well enough +-- (Ref: https://github.com/tree-sitter/tree-sitter/issues/98), thus we need to +-- handle empty strings differently using the optional keyword +-- Rules with only an empty string as RHS is not supported by tree-sitter, but if +-- RHS choices contains one option of empty string, we remove it and wrap entire +-- RHS in optional() +-- e.g. choice(seq(), "literal", seq($.tokenA, $.ruleB)) +-- => optional("literal", seq($.tokenA, $.ruleB)) +type RhsItem = Maybe Doc + -- | Generates one or two tree-sitter rule(s) for one non-terminal from CF. -- Uses choice function from tree-sitter to combine rules for the non-terminal -- If the non-terminal has internal rules, an internal version of the non-terminal @@ -168,9 +169,10 @@ prOneCat rules nt = if int then defineSymbol (formatCatName True nt) $+$ indent (appendComma intRhs) else empty - parRhs = wrapChoice $ transChoice ++ genChoice (filter isParsable rules) - transChoice = [text $ refName $ formatCatName True nt | int] - intRhs = wrapChoice $ genChoice (filter (not . isParsable) rules) + parRhs = unwrapRhsItem $ wrapChoiceOptional $ transChoice ++ genChoice (filter isParsable rules) + transChoice = [Just $ text $ refName $ formatCatName True nt | int] + intRhs = unwrapRhsItem $ wrapChoiceOptional $ genChoice (filter (not . isParsable) rules) + unwrapRhsItem = fromMaybe (error "Tree sitter does not allow RHS of a rule to be one empty string only") genChoice = map (wrapSeq . formatRhs . rhsRule) -- | Generate one tree-sitter rule for one defined token @@ -195,19 +197,31 @@ commaJoin newline = | isEmpty b = a | otherwise = (if newline then ($+$) else (<>)) (a <> ",") b -wrapSeq :: [Doc] -> Doc -wrapSeq = wrapOptListFun "seq" False +-- Empty strings in a sequence can just be dropped and ignored +wrapSeq :: [RhsItem] -> RhsItem +wrapSeq = wrapOptListFun "seq" False . catMaybes -wrapChoice :: [Doc] -> Doc -wrapChoice = wrapOptListFun "choice" True +-- Strictly forbids empty strings +-- If any of the choice is empty string, returning empty +wrapChoiceStrict :: [RhsItem] -> RhsItem +wrapChoiceStrict items = wrapOptListFun "choice" True =<< sequence items + +-- Use optional keyword to handle empty strings +-- If empty string is present, all else is wrapped in optional +wrapChoiceOptional :: [RhsItem] -> RhsItem +wrapChoiceOptional items = if hasEmpty + then wrapped >>= \w -> Just $ text "optional" <> text "(" <> w <> text ")" + else wrapped + where + hasEmpty = any isNothing items + wrapped = wrapOptListFun "choice" True $ catMaybes items -- | Wrap list using tree-sitter fun if the list contains multiple items -- Returns the only item without wrapping otherwise -wrapOptListFun :: String -> Bool -> [Doc] -> Doc -wrapOptListFun fun newline list = - if length list == 1 - then head list - else wrapFun fun newline (commaJoin newline list) +wrapOptListFun :: String -> Bool -> [Doc] -> RhsItem +wrapOptListFun _ _ [] = Nothing +wrapOptListFun _ _ [oneItem] = Just oneItem +wrapOptListFun fun newline list = Just $ wrapFun fun newline (commaJoin newline list) wrapFun :: String -> Bool -> Doc -> Doc wrapFun fun newline arg = joinOp [text fun <> text "(", indent arg, text ")"] @@ -219,14 +233,21 @@ refName :: String -> String refName = ("$." ++) -- | Format right hand side into list of strings -formatRhs :: SentForm -> [Doc] +formatRhs :: SentForm -> [RhsItem] formatRhs = map (\case - Left c -> text $ refName $ formatCatName False c - Right term -> quoted term) + Left c -> Just$ text $ refName $ formatCatName False c + Right "" -> Nothing + Right term -> Just $ quoted term) + +stringLiteralReserved:: String +stringLiteralReserved = "\"\\" + +escapeStringLiteral:: String -> String +escapeStringLiteral = concatMap $ escapeCharFrom stringLiteralReserved quoted :: String -> Doc -quoted s = text "\"" <> text s <> text "\"" +quoted s = text "\"" <> text (escapeStringLiteral s) <> text "\"" -- | Format string for cat name, prefix "_" if the name is for internal rules formatCatName :: Bool -> Cat -> String diff --git a/source/src/BNFC/Backend/TreeSitter/RegToJSReg.hs b/source/src/BNFC/Backend/TreeSitter/RegToJSReg.hs index 0f233c50..61c113de 100644 --- a/source/src/BNFC/Backend/TreeSitter/RegToJSReg.hs +++ b/source/src/BNFC/Backend/TreeSitter/RegToJSReg.hs @@ -10,7 +10,7 @@ -} {-# LANGUAGE LambdaCase #-} -module BNFC.Backend.TreeSitter.RegToJSReg (printRegJSReg) where +module BNFC.Backend.TreeSitter.RegToJSReg (printRegJSReg, escapeCharFrom) where import BNFC.Abs From 8ce56c89e2802859b9286541f9b4687bff4397f2 Mon Sep 17 00:00:00 2001 From: Kangjing Huang Date: Thu, 25 Jan 2024 20:19:28 -0500 Subject: [PATCH 4/9] Preliminary fix for tree-sitter empty match issues --- .../BNFC/Backend/TreeSitter/CFtoTreeSitter.hs | 182 ++++++++++++------ 1 file changed, 124 insertions(+), 58 deletions(-) diff --git a/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs b/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs index aaf0aea1..f2cfb39d 100644 --- a/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs +++ b/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs @@ -8,16 +8,21 @@ Created : 08 Nov, 2023 -} +{-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE FlexibleInstances #-} module BNFC.Backend.TreeSitter.CFtoTreeSitter where import BNFC.Abs (Reg (RSeq, RSeqs, RStar, RAny)) import BNFC.Backend.TreeSitter.RegToJSReg + ( escapeCharFrom, printRegJSReg ) import BNFC.CF import BNFC.Lexing (mkRegMultilineComment) import BNFC.PrettyPrint + import Prelude hiding ((<>)) -import Data.Maybe (catMaybes, isNothing, fromMaybe) +import qualified Data.Map as Map +import Data.Map (Map) -- | Indent one level of 2 spaces indent :: Doc -> Doc @@ -79,9 +84,19 @@ prExtras cf = -- into this list. This will require integration of a regex engine. prWord :: CF -> Doc prWord cf = - maybe empty (\word_list -> defineSymbol "word" $+$ indent word_list <> ",") $ - wrapChoiceStrict $ map Just $ usrTokensFormatted ++ [text "$.token_Ident" | identUsed] + if wordNeeded + then + defineSymbol "word" + $+$ indent + ( wrapChoice + ( usrTokensFormatted + ++ [text "$.token_Ident" | identUsed] + ) + ) + <> "," + else empty where + wordNeeded = identUsed || usrTokens /= [] identUsed = isUsedCat cf (TokenCat catIdent) usrTokens = tokenPragmas cf usrTokensFormatted = @@ -109,6 +124,87 @@ stringRule = identRule = defineSymbol "token_Ident" <+> text "/[a-zA-Z][a-zA-Z\\d_']*/" <> "," +-- Tree Sitter does not support empty strings well enough +-- (Ref: https://github.com/tree-sitter/tree-sitter/issues/98), thus we need to +-- handle empty strings differently using the optional keyword +-- Crucially, any named symbols in Tree Sitter cannot have a zero width match, +-- unless it is the start symbol +-- Thus, we look for all categories that match zero width, tag them as "optional" +-- And when that cat is referred to, we use `optional($.cat_name)` in place of +-- `$.cat_name` to circumvent this issue + +-- | Cat with a tag indicating if it is optional +data OCat a = Always a | Optional a +type Cat' = OCat Cat + + -- | unwrap to original Cat +unwrap:: Cat' -> Cat +unwrap (Always c) = c +unwrap (Optional c) = c + +-- | Rule with RHS tagged +data Rule' = Rule' {srcRule::Rule, taggedRhs::SentForm'} +type SentForm' = [Either Cat' String] + +-- | type class for wrapped or unwrapped rules +class GetRule a where + getRule:: a -> Rule + +instance GetRule Rule where + getRule = id + +instance GetRule Rule' where + getRule = srcRule + +-- | Format right hand side of a rule into list of Docs +formatRuleRhs:: Rule' -> [Doc] +formatRuleRhs r = + map (\case + Left (Always c) -> text $ refName $ formatCatName False c + Left (Optional c) -> wrapFun "optional" False $ + text (refName $ formatCatName False c) + Right term -> quoted term) $ taggedRhs r + +-- | Analyzes the grammar with the entrance symbol. This function +-- groups all remaining categories with their rules including internal rules, +-- with optional flags determined and tagged for all returning Cats and rules +-- Returns (list of remaining tagged categories and tagged rules, tagged entrance rules) +analyzeCF :: CF -> Cat -> ([(Cat', [Rule'])], [Rule']) +analyzeCF cf entryCat = + ( + -- Empty rules are excluded from normal categories since they are handled by + -- "optional()" keywords in tree-sitter + [(wrapCat c, + map wrapRule $ filter (not . ruleIsEmpty) $ rulesForCat' cf c) + | c <- allCats, c /= entryCat], + -- Tree-sitter should support zero-width matches with root (i.e. entrance) symbol + -- thus no need to filter them out + map wrapRule $ rulesForCat' cf entryCat + ) + where + allCats = reallyAllCats cf + -- Stores mapping from Cat to optional flag + -- Currently we only recognize optional rules if any RHS of the rules is empty + -- list, and ignore more complex cases. + -- Complex optional cases may trigger tree-sitter to fail or bug out. + catOptMap = Map.fromList $ + map (\c -> (c, any ruleIsEmpty (rulesForCat' cf c))) + -- Always format entrance symbols as non-optional since + -- tree-sitter should support zero-width matches on them + $ filter (/= entryCat) allCats + -- Tags Cat to Cat' using catOptMap + wrapCat:: Cat -> Cat' + wrapCat c = if Map.findWithDefault False c catOptMap + then Optional c + else Always c + wrapRule r = Rule' {srcRule = r, + taggedRhs = map wrapSentFormItem $ rhsRule r + } + wrapSentFormItem :: Either Cat String -> Either Cat' String + wrapSentFormItem (Left c) = Left $ wrapCat c + wrapSentFormItem (Right s) = Right s + ruleIsEmpty = null . rhsRule . getRule + -- | First print the entrypoint rule, tree-sitter always use the -- first rule as entrypoint and does not support multi-entrypoint. -- Then print rest of the rules @@ -116,8 +212,10 @@ prRules :: CF -> Doc prRules cf = if onlyOneEntry then + -- entry rules are formatted without optional + -- tree-sitter should support zero-width (a.k.a empty) matches for top level symbols prOneCat entryRules entryCat - $+$ prOtherRules entryCat cf + $+$ prOtherRules otherCatRules else error "Tree-sitter only supports one entrypoint" where --If entrypoint is defined, there must be only one entrypoint @@ -125,14 +223,13 @@ prRules cf = onlyOneEntry = not (hasEntryPoint cf) || onlyOneEntryDefined onlyOneEntryDefined = length (allEntryPoints cf) == 1 entryCat = firstEntry cf - entryRules = rulesForCat' cf entryCat + (otherCatRules, entryRules) = analyzeCF cf entryCat -- | Print all other rules except the entrypoint -prOtherRules :: Cat -> CF -> Doc -prOtherRules entryCat cf = vcat' $ map mkOne rules +prOtherRules :: [(Cat', [Rule'])] -> Doc +prOtherRules otherRules = vcat' $ map mkOne otherRules where - rules = [(c, r) | (c, r) <- ruleGroupsInternals cf, c /= entryCat] - mkOne (cat, rules) = prOneCat rules cat + mkOne (cat, rules) = prOneCat rules $ unwrap cat prUsrTokenRules :: CF -> Doc prUsrTokenRules cf = vcat' $ map prOneToken tokens @@ -140,27 +237,17 @@ prUsrTokenRules cf = vcat' $ map prOneToken tokens tokens = tokenPragmas cf -- | Check if a set of rules contains internal rules -hasInternal :: [Rule] -> Bool -hasInternal = not . all isParsable - --- Tree Sitter does not support empty strings well enough --- (Ref: https://github.com/tree-sitter/tree-sitter/issues/98), thus we need to --- handle empty strings differently using the optional keyword --- Rules with only an empty string as RHS is not supported by tree-sitter, but if --- RHS choices contains one option of empty string, we remove it and wrap entire --- RHS in optional() --- e.g. choice(seq(), "literal", seq($.tokenA, $.ruleB)) --- => optional("literal", seq($.tokenA, $.ruleB)) -type RhsItem = Maybe Doc +hasInternal :: (GetRule a) => [a] -> Bool +hasInternal = not . all (isParsable . getRule) -- | Generates one or two tree-sitter rule(s) for one non-terminal from CF. -- Uses choice function from tree-sitter to combine rules for the non-terminal -- If the non-terminal has internal rules, an internal version of the non-terminal -- will be created (prefixed with "_" in tree-sitter), and all internal rules will -- be sectioned as such. -prOneCat :: [Rule] -> NonTerminal -> Doc +prOneCat :: [Rule'] -> NonTerminal -> Doc prOneCat rules nt = - defineSymbol (formatCatName False nt) + defineSymbol (formatCatName False $ nt) $+$ indent (appendComma parRhs) $+$ internalRules where @@ -169,11 +256,10 @@ prOneCat rules nt = if int then defineSymbol (formatCatName True nt) $+$ indent (appendComma intRhs) else empty - parRhs = unwrapRhsItem $ wrapChoiceOptional $ transChoice ++ genChoice (filter isParsable rules) - transChoice = [Just $ text $ refName $ formatCatName True nt | int] - intRhs = unwrapRhsItem $ wrapChoiceOptional $ genChoice (filter (not . isParsable) rules) - unwrapRhsItem = fromMaybe (error "Tree sitter does not allow RHS of a rule to be one empty string only") - genChoice = map (wrapSeq . formatRhs . rhsRule) + parRhs = wrapChoice $ transChoice ++ genChoice (filter (isParsable . getRule) rules) + transChoice = [text $ refName $ formatCatName True nt | int] + intRhs = wrapChoice $ genChoice (filter (not . isParsable. getRule) rules) + genChoice = map (wrapSeq . formatRuleRhs) -- | Generate one tree-sitter rule for one defined token prOneToken :: (TokenCat, Reg) -> Doc @@ -197,31 +283,19 @@ commaJoin newline = | isEmpty b = a | otherwise = (if newline then ($+$) else (<>)) (a <> ",") b --- Empty strings in a sequence can just be dropped and ignored -wrapSeq :: [RhsItem] -> RhsItem -wrapSeq = wrapOptListFun "seq" False . catMaybes - --- Strictly forbids empty strings --- If any of the choice is empty string, returning empty -wrapChoiceStrict :: [RhsItem] -> RhsItem -wrapChoiceStrict items = wrapOptListFun "choice" True =<< sequence items - --- Use optional keyword to handle empty strings --- If empty string is present, all else is wrapped in optional -wrapChoiceOptional :: [RhsItem] -> RhsItem -wrapChoiceOptional items = if hasEmpty - then wrapped >>= \w -> Just $ text "optional" <> text "(" <> w <> text ")" - else wrapped - where - hasEmpty = any isNothing items - wrapped = wrapOptListFun "choice" True $ catMaybes items +wrapSeq :: [Doc] -> Doc +wrapSeq = wrapOptListFun "seq" False + +wrapChoice :: [Doc] -> Doc +wrapChoice = wrapOptListFun "choice" True -- | Wrap list using tree-sitter fun if the list contains multiple items -- Returns the only item without wrapping otherwise -wrapOptListFun :: String -> Bool -> [Doc] -> RhsItem -wrapOptListFun _ _ [] = Nothing -wrapOptListFun _ _ [oneItem] = Just oneItem -wrapOptListFun fun newline list = Just $ wrapFun fun newline (commaJoin newline list) +wrapOptListFun :: String -> Bool -> [Doc] -> Doc +wrapOptListFun fun newline list = + if length list == 1 + then head list + else wrapFun fun newline (commaJoin newline list) wrapFun :: String -> Bool -> Doc -> Doc wrapFun fun newline arg = joinOp [text fun <> text "(", indent arg, text ")"] @@ -232,14 +306,6 @@ wrapFun fun newline arg = joinOp [text fun <> text "(", indent arg, text ")"] refName :: String -> String refName = ("$." ++) --- | Format right hand side into list of strings -formatRhs :: SentForm -> [RhsItem] -formatRhs = - map (\case - Left c -> Just$ text $ refName $ formatCatName False c - Right "" -> Nothing - Right term -> Just $ quoted term) - stringLiteralReserved:: String stringLiteralReserved = "\"\\" From c2ca38839139e20114fdb3134f4eaf8e98c54ff5 Mon Sep 17 00:00:00 2001 From: Kangjing Huang Date: Fri, 2 Feb 2024 16:16:10 -0500 Subject: [PATCH 5/9] Cleanup warning --- source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs b/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs index f2cfb39d..005d2c80 100644 --- a/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs +++ b/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs @@ -22,7 +22,7 @@ import BNFC.PrettyPrint import Prelude hiding ((<>)) import qualified Data.Map as Map -import Data.Map (Map) +import Data.Map () -- | Indent one level of 2 spaces indent :: Doc -> Doc From c2f67b30e3a029ef00bcaffe08937564b7a89de2 Mon Sep 17 00:00:00 2001 From: Kangjing Huang Date: Fri, 2 Feb 2024 18:38:44 -0500 Subject: [PATCH 6/9] Make zero width match removal optional in code --- source/src/BNFC/Backend/TreeSitter.hs | 6 +- .../BNFC/Backend/TreeSitter/CFtoTreeSitter.hs | 109 ++++++++++++------ 2 files changed, 75 insertions(+), 40 deletions(-) diff --git a/source/src/BNFC/Backend/TreeSitter.hs b/source/src/BNFC/Backend/TreeSitter.hs index 2849c39f..ba52dade 100644 --- a/source/src/BNFC/Backend/TreeSitter.hs +++ b/source/src/BNFC/Backend/TreeSitter.hs @@ -20,11 +20,13 @@ import BNFC.PrettyPrint -- | Entry point: create grammar.js file makeTreeSitter :: SharedOptions -> CF -> Backend makeTreeSitter opts cf = do - mkfile "grammar.js" comment (render $ cfToTreeSitter name cf) + -- Always remove zero width match for now, if needed, can be changed + -- to remove on flag in the future + mkfile "grammar.js" comment (render $ cfToTreeSitter name cf True) where name = lang opts comment :: String -> String comment = ("// " ++) --- | TODO: Add Makefile generation for tree-sitter +-- TODO: Add Makefile generation for tree-sitter diff --git a/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs b/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs index 005d2c80..080ffdaf 100644 --- a/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs +++ b/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs @@ -29,8 +29,11 @@ indent :: Doc -> Doc indent = nest 2 -- | Create content of grammar.js file -cfToTreeSitter :: String -> CF -> Doc -cfToTreeSitter name cf = +cfToTreeSitter :: String -- ^ Name of the language + -> CF -- ^ Context-Free grammar of the language + -> Bool -- ^ Flags to enable zero-width match elimination + -> Doc -- ^ grammar.js file generated +cfToTreeSitter name cf removeZero = -- Overall structure of grammar.js text "module.exports = grammar({" $+$ indent @@ -46,7 +49,7 @@ cfToTreeSitter name cf = rulesSection = text "rules: {" $+$ indent - ( prRules cf + ( prRules cf removeZero $+$ prUsrTokenRules cf $+$ prBuiltinTokenRules cf ) @@ -134,43 +137,67 @@ identRule = -- `$.cat_name` to circumvent this issue -- | Cat with a tag indicating if it is optional -data OCat a = Always a | Optional a -type Cat' = OCat Cat +data CatOpt' a = Always a | Optional a +type CatOpt = CatOpt' Cat - -- | unwrap to original Cat -unwrap:: Cat' -> Cat -unwrap (Always c) = c -unwrap (Optional c) = c +-- | type class for OCat or Cat +class UnwrapCat a where + unwrap:: a -> Cat + +instance UnwrapCat Cat where + unwrap = id + +instance UnwrapCat CatOpt where + -- | unwrap to original Cat + unwrap (Always c) = c + unwrap (Optional c) = c -- | Rule with RHS tagged -data Rule' = Rule' {srcRule::Rule, taggedRhs::SentForm'} -type SentForm' = [Either Cat' String] +data RuleOpt = Rule' {srcRule::Rule, taggedRhs::SentFormOpt} +type SentFormOpt = [Either CatOpt String] --- | type class for wrapped or unwrapped rules -class GetRule a where +-- | type class for a rule data type that can be formatted +class FormatRule a where + -- | get the original rule getRule:: a -> Rule + -- | format the RHS of the rule + formatRuleRhs:: a -> [Doc] -instance GetRule Rule where +instance FormatRule Rule where getRule = id -instance GetRule Rule' where + formatRuleRhs r = + map (\case + Left c -> text $ refName $ formatCatName False c + Right term -> quoted term) $ rhsRule r + + +instance FormatRule RuleOpt where getRule = srcRule --- | Format right hand side of a rule into list of Docs -formatRuleRhs:: Rule' -> [Doc] -formatRuleRhs r = - map (\case - Left (Always c) -> text $ refName $ formatCatName False c - Left (Optional c) -> wrapFun "optional" False $ - text (refName $ formatCatName False c) - Right term -> quoted term) $ taggedRhs r - --- | Analyzes the grammar with the entrance symbol. This function --- groups all remaining categories with their rules including internal rules, --- with optional flags determined and tagged for all returning Cats and rules + formatRuleRhs r = + map (\case + Left (Always c) -> text $ refName $ formatCatName False c + Left (Optional c) -> wrapFun "optional" False $ + text (refName $ formatCatName False c) + Right term -> quoted term) $ taggedRhs r + +-- | Analyzes the grammar with the entrance symbol. +-- This function finds all rules for the entrance symbol, and groups +-- all remaining categories with their rules, including internal rules. +analyzeCF :: CF -- ^ Context-free grammar of the language + -> Cat -- ^ Category object for the entrance symbol + -> ([(Cat, [Rule])], [Rule]) -- ^ (groups of remaining categories and rules, entrance rules) +analyzeCF _ _ = ([], []) + +-- | Analyzes the grammar with the entrance symbol. +-- This version of analyze function performs zero-width match analysis on all symbols and +-- returns with optional flags determined and tagged for all returning Cats and rules. -- Returns (list of remaining tagged categories and tagged rules, tagged entrance rules) -analyzeCF :: CF -> Cat -> ([(Cat', [Rule'])], [Rule']) -analyzeCF cf entryCat = +analyzeCFOptional :: CF -- ^ Context-free grammar of the language + -> Cat -- ^ Category object of the entrance symbol + -> ([(CatOpt, [RuleOpt])], [RuleOpt]) -- ^ (groups of tagged remaining categories and tagged rules, tagged entrance rules) +analyzeCFOptional cf entryCat = ( -- Empty rules are excluded from normal categories since they are handled by -- "optional()" keywords in tree-sitter @@ -193,14 +220,14 @@ analyzeCF cf entryCat = -- tree-sitter should support zero-width matches on them $ filter (/= entryCat) allCats -- Tags Cat to Cat' using catOptMap - wrapCat:: Cat -> Cat' + wrapCat:: Cat -> CatOpt wrapCat c = if Map.findWithDefault False c catOptMap then Optional c else Always c wrapRule r = Rule' {srcRule = r, taggedRhs = map wrapSentFormItem $ rhsRule r } - wrapSentFormItem :: Either Cat String -> Either Cat' String + wrapSentFormItem :: Either Cat String -> Either CatOpt String wrapSentFormItem (Left c) = Left $ wrapCat c wrapSentFormItem (Right s) = Right s ruleIsEmpty = null . rhsRule . getRule @@ -208,14 +235,19 @@ analyzeCF cf entryCat = -- | First print the entrypoint rule, tree-sitter always use the -- first rule as entrypoint and does not support multi-entrypoint. -- Then print rest of the rules -prRules :: CF -> Doc -prRules cf = +prRules :: CF -> Bool -> Doc +prRules cf removeZero = if onlyOneEntry then -- entry rules are formatted without optional -- tree-sitter should support zero-width (a.k.a empty) matches for top level symbols - prOneCat entryRules entryCat - $+$ prOtherRules otherCatRules + if removeZero + then + prOneCat entryRulesOpt entryCat + $+$ prOtherRules otherCatRulesOpt + else + prOneCat entryRules entryCat + $+$ prOtherRules otherCatRules else error "Tree-sitter only supports one entrypoint" where --If entrypoint is defined, there must be only one entrypoint @@ -223,10 +255,11 @@ prRules cf = onlyOneEntry = not (hasEntryPoint cf) || onlyOneEntryDefined onlyOneEntryDefined = length (allEntryPoints cf) == 1 entryCat = firstEntry cf + (otherCatRulesOpt, entryRulesOpt) = analyzeCFOptional cf entryCat (otherCatRules, entryRules) = analyzeCF cf entryCat -- | Print all other rules except the entrypoint -prOtherRules :: [(Cat', [Rule'])] -> Doc +prOtherRules :: (UnwrapCat a, FormatRule b) => [(a, [b])] -> Doc prOtherRules otherRules = vcat' $ map mkOne otherRules where mkOne (cat, rules) = prOneCat rules $ unwrap cat @@ -237,7 +270,7 @@ prUsrTokenRules cf = vcat' $ map prOneToken tokens tokens = tokenPragmas cf -- | Check if a set of rules contains internal rules -hasInternal :: (GetRule a) => [a] -> Bool +hasInternal :: (FormatRule a) => [a] -> Bool hasInternal = not . all (isParsable . getRule) -- | Generates one or two tree-sitter rule(s) for one non-terminal from CF. @@ -245,7 +278,7 @@ hasInternal = not . all (isParsable . getRule) -- If the non-terminal has internal rules, an internal version of the non-terminal -- will be created (prefixed with "_" in tree-sitter), and all internal rules will -- be sectioned as such. -prOneCat :: [Rule'] -> NonTerminal -> Doc +prOneCat :: FormatRule a => [a] -> NonTerminal -> Doc prOneCat rules nt = defineSymbol (formatCatName False $ nt) $+$ indent (appendComma parRhs) From ff24228a7d7f3d6d0907eb92d50c97a0186ddd03 Mon Sep 17 00:00:00 2001 From: Kangjing Huang Date: Fri, 2 Feb 2024 19:01:27 -0500 Subject: [PATCH 7/9] Missing analyze function for disabled zero removal --- source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs b/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs index 080ffdaf..7f6da2bd 100644 --- a/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs +++ b/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs @@ -188,7 +188,9 @@ instance FormatRule RuleOpt where analyzeCF :: CF -- ^ Context-free grammar of the language -> Cat -- ^ Category object for the entrance symbol -> ([(Cat, [Rule])], [Rule]) -- ^ (groups of remaining categories and rules, entrance rules) -analyzeCF _ _ = ([], []) +analyzeCF cf entryCat = ([(c, rulesForCat' cf c)| c <- allCats, c /= entryCat], + rulesForCat' cf entryCat) + where allCats = reallyAllCats cf -- | Analyzes the grammar with the entrance symbol. -- This version of analyze function performs zero-width match analysis on all symbols and From 3667a0601e471cf84b021fb16acdef15b291ee63 Mon Sep 17 00:00:00 2001 From: Kangjing Huang Date: Sat, 3 Feb 2024 23:35:39 -0500 Subject: [PATCH 8/9] Support older GHC versions --- source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs b/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs index 7f6da2bd..8032bcde 100644 --- a/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs +++ b/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs @@ -149,8 +149,9 @@ instance UnwrapCat Cat where instance UnwrapCat CatOpt where -- | unwrap to original Cat - unwrap (Always c) = c - unwrap (Optional c) = c + unwrap c = case c of + Always ct -> ct + Optional ct -> ct -- | Rule with RHS tagged data RuleOpt = Rule' {srcRule::Rule, taggedRhs::SentFormOpt} From 39fb1ec28c1a3e939ec3dbcc203547419065550f Mon Sep 17 00:00:00 2001 From: Kangjing Huang Date: Sat, 3 Feb 2024 23:43:33 -0500 Subject: [PATCH 9/9] Fix parse error on older GHC --- source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs b/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs index 8032bcde..8b5d3fff 100644 --- a/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs +++ b/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs @@ -142,16 +142,15 @@ type CatOpt = CatOpt' Cat -- | type class for OCat or Cat class UnwrapCat a where + -- | unwrap to original Cat unwrap:: a -> Cat instance UnwrapCat Cat where unwrap = id instance UnwrapCat CatOpt where - -- | unwrap to original Cat - unwrap c = case c of - Always ct -> ct - Optional ct -> ct + unwrap (Always c) = c + unwrap (Optional c) = c -- | Rule with RHS tagged data RuleOpt = Rule' {srcRule::Rule, taggedRhs::SentFormOpt}