Skip to content

Commit

Permalink
Merge pull request #1864 from jplag/feature/refactorRLang
Browse files Browse the repository at this point in the history
Refactored RLang language module to new framework
  • Loading branch information
tsaglam authored Oct 30, 2024
2 parents 69cc90d + a84ac1e commit 264a159
Show file tree
Hide file tree
Showing 8 changed files with 374 additions and 766 deletions.
5 changes: 5 additions & 0 deletions languages/rlang/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@
<groupId>org.antlr</groupId>
<artifactId>antlr4-runtime</artifactId>
</dependency>
<dependency>
<groupId>de.jplag</groupId>
<artifactId>language-antlr-utils</artifactId>
<version>${revision}</version>
</dependency>
</dependencies>

<build>
Expand Down
304 changes: 177 additions & 127 deletions languages/rlang/src/main/antlr4/de/jplag/rlang/grammar/R.g4
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
[The "BSD licence"]
Copyright (c) 2013 Terence Parr
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
Expand All @@ -12,6 +13,7 @@
documentation and/or other materials provided with the distribution.
3. The name of the author may not be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
Expand All @@ -27,28 +29,27 @@
/**
derived from http://svn.r-project.org/R/trunk/src/main/gram.y
http://cran.r-project.org/doc/manuals/R-lang.html#Parser
I'm no R genius but this seems to work.
Requires RFilter.g4 to strip away NL that are really whitespace,
not end-of-command. See TestR.java
Usage:
$ antlr4 R.g4 RFilter.g4
$ javac *.java
$ java TestR sample.R
... prints parse tree ...
*/

/*
Modified version of the original at https://github.com/antlr/grammars-v4/blob/master/r/R.g4, changed so that the most relevant tokens of R
can be separated in the JplagRListener.java file.
Author of the modification: Antonio Javier Rodriguez Perez
*/
// $antlr-format alignTrailingComments true, columnLimit 150, minEmptyLines 1, maxEmptyLinesToKeep 1, reflowComments false, useTab false
// $antlr-format allowShortRulesOnASingleLine false, allowShortBlocksOnASingleLine true, alignSemicolons hanging, alignColons hanging

grammar R;

prog: ( expr (';'|NL)
| NL
)*
EOF
// Entry rule for a whole R source file: any mix of expressions and runs of statement separators
// (';' or newline tokens), followed by the end of input.
prog
: ((SEMICOLON | NL)+ | expr )* EOF
;

/*
Expand All @@ -58,159 +59,208 @@ expr_or_assign
;
*/

expr: expr index_statement // '[[' follows R's yacc grammar
| expr access_package expr
| expr ('$'|'@') expr
| <assoc=right> expr '^' expr
| ('-'|'+') expr
| expr ':' expr
| expr USER_OP expr // anything wrappedin %: '%' .* '%'
| expr ('*'|'/') expr
| expr ('+'|'-') expr
| expr ('>'|'>='|'<'|'<='|'=='|'!=') expr
| '!' expr
| expr ('&'|'&&') expr
| expr ('|'|'||') expr
| '~' expr
| expr '~' expr
| expr assign_value expr
| function_definition // define function
| expr function_call // call function
| compound_statement
| if_statement
| for_statement
| while_statement
| repeat_statement
| help
| next_statement
| break_statement
| '(' expr ')'
| ID
| constant
// Single left-recursive rule covering every R expression and statement form.
// Each alternative carries a #Label, so ANTLR generates a dedicated context (and enter/exit callback) per construct.
// For the operator alternatives, ANTLR 4 gives alternatives listed earlier the tighter binding, so the order of the
// alternatives below is significant and must not be shuffled.
expr
: expr LIST_ACCESS_START sublist LIST_ACCESS_END #ListAccess // '[[' follows R's yacc grammar
| expr ARRAY_ACCESS_START sublist ARRAY_ACCESS_END #ArrayAccess
| expr NAMESPACE_ACCESS expr #NamespaceAccess
| expr COMPONENT_ACCESS expr #ComponentAccess
| <assoc = right> expr '^' expr #Exponent
| ADD_SUB expr #Sign
| expr RANGE_OPERATOR expr #Range
| expr USER_OP expr #UserDefinedOperation // anything wrapped in %: '%' .* '%'
| expr MULT_DIV expr #MultOrDiv
| expr ADD_SUB expr #AddOrSub
| expr COMPARATOR expr #Comparison
| NOT expr #Not
| expr AND expr #And
| expr OR expr #Or
| '~' expr #ModelFormulaePrefix
| expr '~' expr #ModelFormulaeInfix
| expr (ASSIGN | EQUALS) expr #Assignment
| FUNCTION PAREN_L formlist? PAREN_R expr #FunctionDefinition // define function
| expr PAREN_L sublist PAREN_R #FunctionCall // call function
| CURLY_L exprlist CURLY_R #CompoundStatement // compound statement
| IF PAREN_L expr PAREN_R expr #If
| IF PAREN_L expr PAREN_R expr NL* ELSE expr #IfElse // NL* lets 'else' start on a later line
| FOR PAREN_L ID IN expr PAREN_R expr #For
| WHILE PAREN_L expr PAREN_R expr #While
| REPEAT expr #Repeat
| HELP expr #Help // get help on expr, usually string or ID
| NEXT #Next
| BREAK #Break
| PAREN_L expr PAREN_R #BracketTerm
| ID #Id
| STRING #String
| HEX #Hex
| INT #Int
| FLOAT #Float
| COMPLEX #Complex
| NULL #Null
| NA #Na
| INF #Inf
| NAN #Nan
| TRUE #True
| FALSE #False
| NL+ expr #Newline // absorbs line breaks that precede an expression
;

index_statement : '[[' sublist ']' ']' | '[' sublist ']' ;

access_package: '::'|':::' ;

function_definition: 'function' '(' formlist? ')' expr ;

function_call : '(' sublist ')' ;

constant: constant_number | constant_string | constant_bool | 'NULL' | 'NA' | 'Inf' | 'NaN' ;

constant_number: HEX | INT | FLOAT | COMPLEX ;

constant_string: STRING ;

constant_bool: 'TRUE' | 'FALSE' ;

help: '?' expr ; // get help on expr, usually string or ID

if_statement : 'if' '(' expr ')' expr | 'if' '(' expr ')' expr 'else' expr ;

for_statement : 'for' '(' ID 'in' expr ')' expr ;

while_statement : 'while' '(' expr ')' expr ;

repeat_statement: 'repeat' expr ;

next_statement: 'next' ;

break_statement: 'break' ;

compound_statement: '{' exprlist '}' ;

// Body of a compound statement: expressions separated by ';' or newline tokens; a separator may be
// followed by nothing (trailing separator).
// The empty alternative is required so that empty blocks such as `function() {}` still parse.
exprlist
    : expr ((SEMICOLON | NL) expr?)*
    |
    ;

formlist : form (',' form)* ;
formlist
: form (',' form)*
;

form: ID
| assign_func_declaration
// One formal parameter inside the parentheses of a function definition:
// a bare name, a name followed by '=' and an expression, '...' or '.'.
form
: ID
| ID EQUALS expr
| '...'
| '.'
;

sublist : sub (',' sub)* ;
sublist
: sub (',' sub)*
;

sub : expr
| assign_value_list
// One entry of a sublist (used between the brackets of a call or subscript):
// a plain expression, a named entry keyed by ID, STRING or NULL (with or without a value after '='),
// '...', '.', or nothing at all.
// The final empty alternative is deliberate: it admits skipped entries such as the first slot in x[, 1].
sub
: expr
| ID EQUALS
| ID EQUALS expr
| STRING EQUALS
| STRING EQUALS expr
| NULL EQUALS
| NULL EQUALS expr
| '...'
| '.'
|
;

assign_value: '<-'|'<<-'|'='|'->'|'->>'|':=';

assign_func_declaration: ID '=' expr | '...' ;

assign_value_list: ID '=' | ID '=' expr | constant_string '=' | constant_string '=' expr | 'NULL' '=' | 'NULL' '=' expr | '...' ;



HEX : '0' ('x'|'X') HEXDIGIT+ [Ll]? ;
// Keywords of the control-flow and function-definition constructs.
// They are declared ahead of ID, so on an equal-length match the keyword token wins over a plain identifier.
IF
    : 'if'
    ;

FOR
    : 'for'
    ;

WHILE
    : 'while'
    ;

REPEAT
    : 'repeat'
    ;

FUNCTION
    : 'function'
    ;

ELSE
    : 'else'
    ;

IN
    : 'in'
    ;

// Tokens for subscripting ('[[ ]]', '[ ]'), namespace access ('::', ':::') and component access ('$', '@').
// NOTE(review): the lexer always takes the longest match, so two adjacent closing brackets are emitted as a single
// LIST_ACCESS_END and never as two ARRAY_ACCESS_END tokens. Nested single-bracket indexing such as a[b[1]] therefore
// looks unparsable by the #ArrayAccess alternative; the previous grammar closed '[[' with two separate ']' tokens
// for this reason. Confirm with a test case before relying on nested subscripts.
LIST_ACCESS_START: '[[';
LIST_ACCESS_END: ']]';
ARRAY_ACCESS_START: '[';
ARRAY_ACCESS_END: ']';
NAMESPACE_ACCESS: ':::' | '::';
COMPONENT_ACCESS: '$' | '@';

// Help operator and the two loop-control keywords.
HELP
    : '?'
    ;

NEXT
    : 'next'
    ;

BREAK
    : 'break'
    ;

// Literal constants of the R language. R is case-sensitive, so every spelling must match exactly;
// anything spelled differently falls through to the ID rule.
NULL: 'NULL';
NA: 'NA';
INF: 'Inf'; // was 'inf': the real constant `Inf` lexed as a plain ID and the #Inf alternative never matched it
NAN: 'NaN';
TRUE: 'TRUE';
FALSE: 'FALSE';

// Operator tokens. Where several spellings share a prefix (e.g. '<', '<=', '<-', '<<-') the lexer
// picks the longest one that matches the input.
NOT
    : '!'
    ;

RANGE_OPERATOR
    : ':'
    ;

MULT_DIV
    : '*'
    | '/'
    ;

ADD_SUB
    : '+'
    | '-'
    ;

COMPARATOR
    : '>'
    | '>='
    | '<'
    | '<='
    | '=='
    | '!='
    ;

ASSIGN
    : '<-'
    | '<<-'
    | '->'
    | '->>'
    | ':='
    ;

EQUALS
    : '='
    ;

AND
    : '&&'
    | '&'
    ;

OR
    : '||'
    | '|'
    ;

// Round and curly bracket tokens.
PAREN_L
    : '('
    ;

PAREN_R
    : ')'
    ;

CURLY_L
    : '{'
    ;

CURLY_R
    : '}'
    ;

HEX
: '0' ('x' | 'X') HEXDIGIT+ [Ll]?
;

INT : DIGIT+ [Ll]? ;
INT
: DIGIT+ [Ll]?
;

fragment
HEXDIGIT : ('0'..'9'|'a'..'f'|'A'..'F') ;
fragment HEXDIGIT
: ('0' ..'9' | 'a' ..'f' | 'A' ..'F')
;

FLOAT: DIGIT+ '.' DIGIT* EXP? [Ll]?
| DIGIT+ EXP? [Ll]?
| '.' DIGIT+ EXP? [Ll]?
FLOAT
: DIGIT+ '.' DIGIT* EXP? [Ll]?
| DIGIT+ EXP? [Ll]?
| '.' DIGIT+ EXP? [Ll]?
;

fragment
DIGIT: '0'..'9' ;
fragment DIGIT
: '0' ..'9'
;

fragment
EXP : ('E' | 'e') ('+' | '-')? INT ;
fragment EXP
: ('E' | 'e') ('+' | '-')? INT
;

COMPLEX
: INT 'i'
| FLOAT 'i'
: INT 'i'
| FLOAT 'i'
;

// Quoted text: double-quoted and single-quoted string literals, plus backtick-quoted names.
// Each alternative excludes the backslash (handled by ESC) and its own delimiter from the body.
STRING
    : '"' (ESC | ~[\\"])*? '"'
    | '\'' (ESC | ~[\\'])*? '\''
    | '`' (ESC | ~[\\`])*? '`' // exclude the backtick, not the apostrophe, so names like `it's` lex
    ;
fragment
ESC : '\\' [abtnfrv"'\\]
| UNICODE_ESCAPE
| HEX_ESCAPE
| OCTAL_ESCAPE
fragment ESC
: '\\' [abtnfrv"'\\]
| UNICODE_ESCAPE
| HEX_ESCAPE
| OCTAL_ESCAPE
;

fragment
UNICODE_ESCAPE
: '\\' 'u' HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT
| '\\' 'u' '{' HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT '}'
fragment UNICODE_ESCAPE
: '\\' 'u' HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT
| '\\' 'u' '{' HEXDIGIT HEXDIGIT HEXDIGIT HEXDIGIT '}'
;

fragment
OCTAL_ESCAPE
: '\\' [0-3] [0-7] [0-7]
| '\\' [0-7] [0-7]
| '\\' [0-7]
fragment OCTAL_ESCAPE
: '\\' [0-3] [0-7] [0-7]
| '\\' [0-7] [0-7]
| '\\' [0-7]
;

fragment
HEX_ESCAPE
: '\\' HEXDIGIT HEXDIGIT?
fragment HEX_ESCAPE
: '\\' HEXDIGIT HEXDIGIT?
;

ID : '.' (LETTER|'_'|'.') (LETTER|DIGIT|'_'|'.')*
| LETTER (LETTER|DIGIT|'_'|'.')*
ID
: '.' (LETTER | '_' | '.') (LETTER | DIGIT | '_' | '.')*
| LETTER (LETTER | DIGIT | '_' | '.')*
;

fragment LETTER : [a-zA-Z] ;

USER_OP : '%' .*? '%' ;
fragment LETTER
: [a-zA-Z]
;

COMMENT : '#' .*? '\r'? '\n' -> type(NL) ;
USER_OP
: '%' .*? '%'
;

// A '#' comment runs to the end of its line and is emitted, together with the line break, as a single NL token.
// EOF is accepted as a terminator too, so a comment on the last line of a file that lacks a trailing newline
// no longer causes a token recognition error.
COMMENT
    : '#' .*? ('\r'? '\n' | EOF) -> type(NL)
    ;

// Match both UNIX and Windows newlines
NL : '\r'? '\n' ;
NL
: '\r'? '\n'
;

SEMICOLON: ';';

WS : [ \t\u000C]+ -> skip ;
WS
: [ \t\u000C]+ -> skip
;
Loading

0 comments on commit 264a159

Please sign in to comment.