Add a parser for BUGS #52

Merged
merged 33 commits into from
Aug 7, 2023
Changes from 27 commits

33 commits
f211466
some prototypes, not working yet
sunxd3 Jul 14, 2023
73c10f0
more functions
sunxd3 Jul 15, 2023
dbe0210
old and new progress
sunxd3 Jul 17, 2023
985a3ed
cleanup
sunxd3 Jul 17, 2023
b03ea66
add `parse` function
sunxd3 Jul 17, 2023
cccf2e3
add some comments
sunxd3 Jul 18, 2023
f46e0a8
move tests to test file, add doc
sunxd3 Jul 23, 2023
8fd97af
Merge remote-tracking branch 'origin/master' into parser
sunxd3 Jul 25, 2023
30c6b11
move `parser.jl` to `src`
sunxd3 Jul 25, 2023
b21a29d
Add type restriction to `_eval`
sunxd3 Jul 25, 2023
d62594f
Multiple fix
sunxd3 Jul 25, 2023
416cc1d
formatting
sunxd3 Jul 25, 2023
14e132e
light format tests, need more and better ones
sunxd3 Jul 25, 2023
eff7b22
formatting related to `_eval`
sunxd3 Jul 25, 2023
9b8a606
Move functions
sunxd3 Jul 26, 2023
e213daa
Add Methadone example, improve to parser
sunxd3 Jul 31, 2023
6ff3e95
add some error cases
sunxd3 Jul 31, 2023
0b7be92
Volume 1-4 from MultiBUGS work with parser
sunxd3 Jul 31, 2023
acab8b6
check Julia keywords
sunxd3 Aug 1, 2023
6cba0a1
improve code doc
sunxd3 Aug 1, 2023
fe2ad9e
Some code improvement and notes on error recovery
sunxd3 Aug 1, 2023
6520dbb
Formatting
sunxd3 Aug 1, 2023
e539d50
fix formatting
yebai Aug 2, 2023
aecbb76
add parser to includes
yebai Aug 2, 2023
83c003b
add comments on errors reporting
sunxd3 Aug 5, 2023
c601039
Merge branch 'parser' of https://github.com/TuringLang/SymbolicPPL.jl…
sunxd3 Aug 5, 2023
58c9ec1
notes on panic mode, rename `@bugsast`
sunxd3 Aug 7, 2023
580906d
Apply suggestions from code review
yebai Aug 7, 2023
ea84593
Fix parser related tests.
sunxd3 Aug 7, 2023
04a4ae6
Merge branch 'parser' of https://github.com/TuringLang/SymbolicPPL.jl…
sunxd3 Aug 7, 2023
095fbfa
Remove unwanted file
sunxd3 Aug 7, 2023
53acef4
formatting
sunxd3 Aug 7, 2023
5315921
Finish renaming `@bugsast` to `@bugs`
sunxd3 Aug 7, 2023
182 changes: 182 additions & 0 deletions docs/src/parser.md
@@ -0,0 +1,182 @@
Strictly speaking, the program is not "parsing", because it does not output a syntax tree.
What the program does is take a token stream and, using a recursive descent structure, check the correctness of the program.
During the recursive descent, BUGS syntax tokens are translated into Julia syntax tokens.
Tokens that are already compatible with Julia are kept as they are; others are either transformed or removed, and additional tokens may be added.

The parser will raise an error when given a program that is not in strict BUGS syntax.

The general idea (sketched in code below) is:
1. use `tokenize` to get the token vector,
2. inspect the tokens and build the Julia version of the program as a vector of tokens,
3. when it is appropriate to do so, simply push the token onto the Julia token vector,
4. at the same time, detect errors and push diagnostics onto the diagnostics vector; some tokens may also be deleted, combined, or replaced.
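
A minimal, illustrative sketch of this loop, assuming JuliaSyntax's `tokenize`/`untokenize` (the helpers the prototypes below already rely on). The names `translate_tokens` and `julia_token_vec`, and the handful of translation rules shown, are made up for the example; the actual implementation lives in `src/parser.jl` and threads a `ProcessState` through dedicated `process_*!` functions.

```julia
using JuliaSyntax: tokenize, untokenize

# Illustration only: walk the token stream and emit a Julia-compatible token vector.
function translate_tokens(text::String)
    token_vec = tokenize(text)      # step 1: the token vector
    julia_token_vec = String[]      # step 2: the Julia version of the program
    diagnostics = String[]          # step 4: diagnostics would be collected here
    for t in token_vec
        raw = untokenize(t, text)
        if raw == "model"           # no Julia counterpart, drop it
            continue
        elseif raw == "{"           # `model { ... }` becomes `begin ... end`
            push!(julia_token_vec, "begin")
        elseif raw == "}"
            push!(julia_token_vec, "end")
        else                        # step 3: already valid Julia, push as is
            push!(julia_token_vec, raw)
        end
    end
    # multi-token BUGS constructs need look-ahead, which is one reason the real
    # parser keeps explicit state instead of a single pass like this one
    return julia_token_vec, diagnostics
end
```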


## Some Notes on error recovery
The current error recovery is ad hoc and mostly primitive.
The parser is written so that, if the program is correct, it descends into the correct function and produces the correct result. If the program is not correct, the wavefront of the token stream stops being pushed forward, and the parser fails.
One failure-detection mechanism is to check whether two errors occur with the same "current token". If they do, the parser stops and reports the error. This is reassuring in the sense that the parser will not accept wrong programs.
(Maybe instead of giving up, we could enter recovery (panic) mode, find a rendezvous (synchronization) point, and continue parsing. This is not straightforward, because we are sometimes in a deep call stack. The most straightforward way to implement this is with exception handling:
```julia
struct ParseException <: Exception end

function parse_expression(parser::Parser)
    # parsing code here
    # If an error occurs, throw ParseException()
end

function parse_statement(parser::Parser)
    # parsing code here
    # If an error occurs, throw ParseException()
end

# ... more parsing functions ...

function parse_program(parser::Parser)
    while !eof(parser)
        try
            parse_statement(parser)
        catch e
            if isa(e, ParseException)
                # Handle the error and recover
                # This might involve advancing to a synchronization point in the input
            else
                rethrow(e) # If it's not a ParseException, re-throw it
            end
        end
    end
end
```
Credit to ChatGPT for the example code.)

Panic mode is a last resort. Ideally, we want to try to sync up the parsing state and just continue parsing. Some thoughts:
* Singular error
  * Misspelling
    * The misspelt word is parsed into a single token
      * The right thing to do is to record the diagnostic, discard the token, and continue parsing
      * Reliable detection is difficult, especially since a misspelt word can be parsed into multiple tokens, so simply checking the next token is not enough, but it might be a good enough heuristic
    * The misspelt word is parsed into multiple tokens
      * The right thing in this case is to skip all the tokens produced by the misspelt word and continue parsing
      * This might not be that bad, because we whitelist the tokens that can be parsed into multiple tokens
  * Missing token
    * The right thing to do is simply continue without consuming or discarding anything
  * Extra token
    * The right thing is to discard tokens until the start of the next state
* Consecutive errors
  * This can be very tricky, because detection is very difficult in the worst case. If we have a state-syncing function for every state and the lookahead length is 2, then we can match the "next token", enter the next state, and then try to sync again in that state.

These are the cases we can try to handle by matching.

The difficulty comes from the fact that the tokenizer is not built for BUGS.

Some starting code
```julia
function sync_state(ps::ProcessState, current_token::String, next_token::Tuple)
    if peek_raw(ps) == current_token && peek_raw(ps, 2) in next_token
        return nothing
    elseif peek_raw(ps) == current_token ## peek_raw(ps, 2) != next_token
        # start an error diagnostic
        # then we seek till one of the next_token is found
        # if not found anything in the given budget, add to the diagnostic
        # add a special place_holder and move on
    elseif peek_raw(ps, 2) == next_token # peek_raw(ps) != current_token
    else
    end
end

function skip_until(ps::ProcessState, t::String, depth_limit=5)
    seek_index = ps.current_index
    while untokenize(ps.token_vec[seek_index], ps.text) != t
        seek_index += 1
        if seek_index > length(ps.token_vec)
            return length(ps.token_vec)
        end
        if seek_index - ps.current_index > depth_limit
            return nothing # give up, indicate that the token is probably missing
        end
    end
    return seek_index # maybe actual location in the text
end
```

More prototype code from a later attempt:
```julia
function discard_until!(ps::ProcessState, targets::Vector{String})
    discarded_program_piece = ""
    text_pos_pre = peek(ps).range.start
    while peek_raw(ps) ∉ targets
        discarded_program_piece *= peek_raw(ps)
        discard!(ps)
    end
    text_pos_post = peek(ps).range.start - 1
    return (text_pos_pre, text_pos_post), discarded_program_piece
end

struct ParseException <: Exception end

function process_toplevel!(ps::ProcessState)
    expect_and_discard!(ps, "model")
    expect!(ps, "{", "begin")
    try # use an exception to unwind the call stack
        process_statements!(ps)
    catch e
        if e isa ParseException
            try_recovery!(ps)
            return nothing
        else
            rethrow(e)
        end
    end
    if peek(ps) != K"}"
        add_diagnostic!(
            ps,
            "Parsing finished without getting to the end of the program. $(peek_raw(ps)) is not expected to lead a statement.",
        )
    end
    expect!(ps, "}", "end")
    return process_trivia!(ps)
end

# panic mode recovery
function try_recovery!(ps)
    # seek to the closest sync point and dispatch to the corresponding `process_` function

    # sync points: for, ;, <, ~, {, }
    # `\n` is not a good sync point, because we allow multiline expressions as C does
    (text_pos_pre, text_pos_post), discarded_program_piece = discard_until!(ps, ["for", ";", "<", "~", "{"])

    try
        if peek_raw(ps) == "for"
        elseif peek_raw(ps) == ";"
            consume!(ps)
            process_statements!(ps)
        elseif peek_raw(ps) in ("<", "~")
            recovery_function(ps) = process_assignment!(ps)
        elseif peek_raw(ps) == "{"
            # TODO: this is the for-loop body
        end
        # finish the current statement and move on
        # this may throw an exception again while in a for loop
        process_statements!(ps)
    catch e
        if e isa ParseException
            try_recovery!(ps)
        else
            rethrow(e)
        end
    end
end
```

Notes from the attempt at implementing `panic mode`:
* What I tried
  * Throw an error when we detect that `current_pos` is not moving forward; this is implemented implicitly in the `add_diagnostic!` function -- when two diagnostics are added with the same `current_pos`, we throw an error.
  * My plan was that, instead of throwing an error, we could enter panic mode and try to recover.
  * Recovery is conceptually simple: we just need to skip tokens until a synchronization point is found, and then dispatch to the corresponding `process_` function.
  * The issue is rooted in the mutually recursive nature of the program: when we throw an exception, we are in a deep call stack, so re-entry into the previous function requires some thought.

* **After some thinking**, a monolithic recovery may not be the best idea; we should put the try/catch in the individual functions (see the sketch below)
  * `process_for`: wrap `process_statements!` so we have a chance to return to the for loop and wrap it up
  * `process_statements!`: wrap the while-loop body
  * The point is that the recovery requirements differ between functions, so we should put the try/catch in the functions themselves.
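
A minimal sketch of that localized-recovery idea, assuming the `ProcessState`, `peek_raw`, `discard_until!`, `process_assignment!`, and a `process_for!` helper from the prototypes above; this is a thought experiment, not the actual implementation:

```julia
# Sketch only: each `process_` function owns recovery for the region it parses.
function process_statements!(ps::ProcessState)
    while peek_raw(ps) ∉ ("}", "")           # stop at the end of the enclosing block / input
        try
            if peek_raw(ps) == "for"
                process_for!(ps)             # carries its own try/catch so the loop can still be closed
            else
                process_assignment!(ps)
            end
        catch e
            e isa ParseException || rethrow(e)
            # local recovery: skip to the next plausible statement boundary and keep parsing
            discard_until!(ps, ["for", ";", "~", "}"])
        end
    end
end
```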
3 changes: 2 additions & 1 deletion src/JuliaBUGS.jl
@@ -19,14 +19,15 @@ import Distributions: truncated
import AbstractPPL: AbstractContext, evaluate!!
import DynamicPPL: settrans!!

export @bugsast, @bugsmodel_str
export @bugs, parse
export compile

# user defined functions and distributions are not supported yet
include("BUGSPrimitives/BUGSPrimitives.jl")
using .BUGSPrimitives

include("bugsast.jl")
include("parser.jl")
include("variable_types.jl")
include("compiler_pass.jl")
include("node_functions.jl")
18 changes: 17 additions & 1 deletion src/bugsast.jl
@@ -150,7 +150,7 @@ allowed syntax is used, and normalizes certain expressions.
Used expression heads: `:~` for tilde calls, `:ref` for indexing, `:(:)` for ranges. These are
converted from `:call` variants.
"""
macro bugsast(expr)
macro bugs(expr)
    return Meta.quot(post_parsing_processing(warn_link_function(bugsast(expr, __source__))))
end

@@ -210,6 +210,22 @@ function bugs_to_julia(s)
    return s
end

function parse(prog::String, replace_period=true, format_output=true)
    ps = ProcessState(prog, replace_period)
    process_toplevel!(ps)
    if !isempty(ps.diagnostics)
        io = IOBuffer()
        JuliaSyntax.show_diagnostics(io, ps.diagnostics, ps.text)
        error("Errors in the program: \n $(String(take!(io)))")
    end
    julia_program = to_julia_program(ps.julia_token_vec, ps.text)
    format_output && (julia_program = format_text(julia_program))
    # return println(julia_program)
    expr = Meta.parse(julia_program)
    return Meta.quot(post_parsing_processing(bugsast(expr, LineNumberNode(1, Symbol(@__FILE__)))))
end

# during the transition phase, this macro is kept, but for internal use only
macro bugsmodel_str(s::String)
    # Convert and wrap the whole thing in a block for parsing
    transformed_code = "begin\n$(bugs_to_julia(s))\nend"
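
For reference, a hedged usage sketch of the new `parse` entry point added above; the example BUGS program and the REPL session are illustrative, not taken from the test suite:

```julia
julia> prog = """
       model {
           for (i in 1:N) {
               y[i] ~ dnorm(mu, tau)
           }
           mu ~ dnorm(0, 1.0E-6)
       }
       """;

julia> expr = JuliaBUGS.parse(prog);  # quoted Julia expression equivalent to the BUGS program

julia> expr isa Expr                  # the result is later consumed by `compile`
true
```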
11 changes: 10 additions & 1 deletion src/graphs.jl
@@ -111,9 +111,14 @@ end
# `_eval` mimics the `eval` function, but uses precompiled functions. This is possible because BUGS essentially only has
# two kinds of expressions: function calls and indexing.
# `env` is a dictionary mapping symbols in `expr` to values; values can be arrays or scalars
function _eval(expr::Number, env)
    return expr
end
function _eval(expr::Symbol, env)
    if expr == :nothing
        return nothing
    elseif expr == :(:)
        return Colon()
    else # intentional strict, all corner cases should be handled above
        return env[expr]
    end
@@ -137,7 +142,7 @@ function _eval(expr::Expr, env)
    end
end
function _eval(expr, env)
    return expr
    return error("Unknown expression type: $expr of type $(typeof(expr))")
end

"""
@@ -153,6 +158,10 @@ struct BUGSModel <: AbstractPPL.AbstractProbabilisticProgram
    sorted_nodes::Vector{VarName}
end

# TODO: because all the (useful) data are already plugged into the expressions
# (i.e., the `node_function_expr`s have all the data embedded), we can slim
# down the variable store so that it only contains observed data, logical variable values,
# and model parameters
function BUGSModel(g, sorted_nodes, vars, array_sizes, data, inits)
    vs = initialize_var_store(data, vars, array_sizes)
    vi = SimpleVarInfo(vs)
1 change: 1 addition & 0 deletions src/node_functions.jl
@@ -287,6 +287,7 @@ function assignment!(pass::NodeFunctions, expr::Expr, env)
    rhs_expr = replace_constants_in_expr(rhs_expr, env)
    evaled_rhs, dependencies, node_args = evaluate_and_track_dependencies(rhs_expr, env)

    # TODO: since we are not evaluating the node function expressions anymore, we don't have to store the expressions as anonymous functions
    # rhs can be evaluated into a concrete value here, because including transformed variables in the data
    # is effectively constant propagation
    if is_resolved(evaled_rhs)