Parse EBNF
Create a simple parser for EBNF grammars. Here is an ebnf grammar in itself and a parser for it in php.
- You can use regular expressions for lexing.
- Generate the calculator in Arithmetic evaluation using an EBNF description of the calculator.
Here is part of the simple parser rules for a calculator taken from the antlr tutorial
expr : term ( ( PLUS | MINUS ) term )* ; term : factor ( ( MULT | DIV ) factor )* ; factor : NUMBER ;
Modula-2
<lang Modula-2> MODULE EBNF;
FROM ASCII IMPORT EOL; FROM InOut IMPORT Done, Read, Write, WriteLn, WriteInt, WriteString; FROM EBNFScanner IMPORT Symbol, sym, id, Ino, GetSym, MarkError, SkipLine; FROM TableHandler IMPORT WordLength, Table, overflow, InitTable, Record, Tabulate;
VAR T0, T1 : Table;
PROCEDURE skip (n : INTEGER);
BEGIN
MarkError (n); WHILE (sym < lpar) OR (sym > period) DO GetSym END
END skip;
PROCEDURE Expression;
PROCEDURE Term;
PROCEDURE Factor;
BEGIN IF sym = ident THEN Record (T0, id, Ino); GetSym ELSIF sym = literal THEN Record (T1, id, Ino); GetSym ELSIF sym = lpar THEN GetSym; Expression; IF sym = rpar THEN GetSym ELSE skip (2) END ELSIF sym = lbk THEN GetSym; Expression; IF sym = rbk THEN GetSym ELSE skip (3) END ELSIF sym = lbr THEN GetSym; Expression; IF sym = rbr THEN GetSym ELSE skip (4) END ELSE skip (5) END END Factor;
BEGIN Factor; WHILE sym < bar DO Factor END END Term;
BEGIN
Term; WHILE sym = bar DO GetSym; Term END
END Expression;
PROCEDURE Production;
BEGIN
Record (T0, id, - INTEGER (Ino)); GetSym; IF sym = eql THEN GetSym ELSE skip (7) END; Expression; IF sym # period THEN MarkError (8); SkipLine END; GetSym
END Production;
BEGIN
InitTable (T0); InitTable (T1); GetSym; WHILE (sym = ident) AND (overflow = 0) DO Production END; IF overflow > 0 THEN WriteLn; WriteString ("Table overflow"); WriteInt (overflow, 6); END; Write (35C); Tabulate (T0); Tabulate (T1);
END EBNF.</lang> And the source for the EBNF scanner. I hope you like nested procedures. <lang Modula-2> IMPLEMENTATION MODULE EBNFScanner;
FROM ASCII IMPORT LF; FROM InOut IMPORT Read, Write, WriteLn, WriteInt, WriteBf, EOF;
VAR ch : CHAR;
MODULE LineHandler;
IMPORT LF, EOF, ch, Ino, Read, Write, WriteLn, WriteInt, WriteBf; EXPORT GetCh, MarkError, SkipLine;
CONST LineWidth = 100;
VAR cc : INTEGER; cc1 : INTEGER; cc2 : INTEGER; line : ARRAY [0..LineWidth - 1] OF CHAR;
PROCEDURE GetLine;
BEGIN IF cc2 > 0 THEN WriteLn; cc2 := 0 END; Read (ch); IF EOF () THEN line [0] := 177C; cc1 := 1 ELSE INC (Ino); WriteInt (Ino, 5); Write (' '); cc1 := 0; LOOP Write (ch); line [cc1] := ch; INC (cc1); IF ch = LF THEN EXIT END; Read (ch) END END END GetLine;
PROCEDURE GetCh;
BEGIN WHILE cc = cc1 DO cc := 0; GetLine END; ch := line [cc]; INC (cc) END GetCh;
PROCEDURE MarkError (n : INTEGER);
BEGIN IF cc2 = 0 THEN Write ('*'); cc2 := 3; REPEAT Write (' '); DEC (cc2) UNTIL cc2 = 0; END; WHILE cc2 < cc DO Write (' '); INC (cc2) END; Write ('^'); WriteInt (n, 1); INC (cc2, 2) END MarkError;
PROCEDURE SkipLine;
BEGIN WHILE ch # LF DO GetCh END; GetCh END SkipLine;
BEGIN (* BEGIN of LineHandler *) cc := 0; cc1 := 0; cc2 := 0 END LineHandler;
PROCEDURE GetSym;
VAR i : CARDINAL;
BEGIN
WHILE ch <= ' ' DO GetCh END; IF ch = '/' THEN SkipLine; WHILE ch <= ' ' DO GetCh END END; IF (CAP (ch) <= 'Z') AND (CAP (ch) >= 'A') THEN i := 0; sym := literal; REPEAT IF i < IdLength THEN id [i] := ch; INC (i) END; IF ch > 'Z' THEN sym := ident END; GetCh UNTIL (CAP (ch) < 'A') OR (CAP (ch) > 'Z'); id [i] := ' ' ELSIF ch = "'" THEN i := 0; GetCh; sym := literal; WHILE ch # "'" DO IF i < IdLength THEN id [i] := ch; INC (i) END; GetCh END; GetCh; id [i] := ' ' WHILE ch # "'" DO IF i < IdLength THEN id [i] := ch; INC (i) END; GetCh END; GetCh; id [i] := ' ' ELSIF ch = '"' THEN i := 0; GetCh; sym := literal; WHILE ch # '"' DO IF i < IdLength THEN id [i] := ch; INC (i) END; GetCh END; GetCh; id [i] := ' ' ELSIF ch = '=' THEN sym := eql; GetCh ELSIF ch = '(' THEN sym := lpar; GetCh ELSIF ch = ')' THEN sym := rpar; GetCh ELSIF ch = '[' THEN sym := lbk; GetCh ELSIF ch = ']' THEN sym := rbk; GetCh ELSIF ch = '{' THEN sym := lbr; GetCh ELSIF ch = '}' THEN sym := rbr; GetCh ELSIF ch = '|' THEN sym := bar; GetCh ELSIF ch = '.' THEN sym := period; GetCh ELSIF ch = 177C THEN sym := other; GetCh ELSE sym := other; GetCh END
END GetSym;
BEGIN
Ino := 0; ch := ' '
END EBNFScanner.</lang>
PicoLisp
<lang PicoLisp>(de EBNF
"expr : term ( ( PLUS | MINUS ) term )* ;" "term : factor ( ( MULT | DIV ) factor )* ;" "factor : NUMBER ;" )
(for E EBNF
(use (@S @E) (unless (and (match '(@S : @E ;) (str E)) (not (cdr @S))) (quit "Invalid EBNF" E) ) (put (car @S) 'ebnf @E) ) )</lang>
<lang PicoLisp>(de matchEbnf (Pat)
(cond ((asoq Pat '((PLUS . +) (MINUS . -) (MULT . *) (DIV . /))) (let Op (cdr @) (when (= Op (car *Lst)) (pop '*Lst) Op ) ) ) ((== 'NUMBER Pat) (cond ((num? (car *Lst)) (pop '*Lst) @ ) ((and (= "-" (car *Lst)) (num? (cadr *Lst))) (setq *Lst (cddr *Lst)) (- @) ) ) ) ((get Pat 'ebnf) (parseLst @)) ((atom Pat)) (T (loop (T (matchEbnf (pop 'Pat)) @) (NIL Pat) (NIL (== '| (pop 'Pat))) (NIL Pat) ) ) ) )
(de parseLst (Pat)
(let (P (pop 'Pat) X (matchEbnf P)) (loop (NIL Pat) (if (n== '* (cadr Pat)) (if (matchEbnf (pop 'Pat)) (setq X (list @ X)) (throw) ) (loop (NIL *Lst) (NIL (matchEbnf (car Pat))) (setq X (list @ X (or (matchEbnf P) (throw)))) ) (setq Pat (cddr Pat)) ) ) X ) )
(de parseEbnf (Str)
(let *Lst (str Str "") (catch NIL (parseLst (get 'expr 'ebnf)) ) ) )</lang>
Output:
: (parseEbnf "1 + 2 * -3 / 7 - 3 * 4") -> (- (+ 1 (/ (* 2 -3) 7)) (* 3 4))
Ruby
<lang ruby>#--
- The tokenizer splits the input into Tokens like "identifier",
- ":", ")*" and so on. This design uses a StringScanner on each line of
- input, therefore a Token can never span more than one line.
- Each Token knows its original line and position, so an error message
- can locate a bad token.
- ++
require 'strscan'
- A line of input.
- where:: A location like "file.txt:3"
- str:: String of this line
Line = Struct.new :where, :str
- A token.
- cat:: A category like :colon, :ident or so on
- str:: String of this token
- line:: Line containing this token
- pos:: Position of this token within this line
Token = Struct.new :cat, :str, :line, :pos
- Reads and returns the next Token. At end of file, returns nil.
- --
- Needs @filename and @in.
- ++
def next_token
# Loop until we reach a Token. loop do # If at end of line, then get next line, or else declare end of # file. if @scanner.eos? if s = @in.gets # Each line needs a new Line object. Tokens can hold references # to old Line objects. @line = Line.new("#{@filename}:#{@in.lineno}", s) @scanner.string = s else return nil # End of file end end
# Skip whitespace. break unless @scanner.skip(/space:+/) end
# Read token by regular expression. if s = @scanner.scan(/:/) c = :colon elsif s = @scanner.scan(/;/) c = :semicolon elsif s = @scanner.scan(/\(/) c = :paren elsif s = @scanner.scan(/\)\?/) c = :option elsif s = @scanner.scan(/\)\*/) c = :repeat elsif s = @scanner.scan(/\)/) c = :group elsif s = @scanner.scan(/\|/) c = :bar elsif s = @scanner.scan(/alpha:alnum:*/) c = :ident elsif s = @scanner.scan(/'[^']*'|"[^"]*"/) # Fix syntax highlighting for Rosetta Code. => ' c = :string elsif s = @scanner.scan(/'[^']*|"[^"]*/) c = :bad_string elsif s = @scanner.scan(/.*/) c = :unknown end
Token.new(c, s, @line, (@scanner.pos - s.length))
end
- Prints a _message_ to standard error, along with location of _token_.
def error(token, message)
line = token.line
# We print a caret ^ pointing at the bad token. We make a very crude # attempt to align the caret ^ in the correct column. If the input # line has a non-[:print:] character, like a tab, then we print it as # a space. STDERR.puts <<EOF
- {line.where}: #{message}
- {line.str.gsub(/[^[:print:]]/, " ")}
- {" " * token.pos}^
EOF end
- --
- The parser converts Tokens to a Grammar object. The parser also
- detects syntax errors.
- ++
- A parsed EBNF grammar. It is an Array of Productions.
class Grammar < Array; end
- A production.
- ident:: The identifier
- alts:: An Array of Alternatives
Production = Struct.new :ident, :alts
- An array of Alternatives, as from "(a | b)".
class Group < Array; end
- An optional group, as from "(a | b)?".
class OptionalGroup < Group; end
- A repeated group, as from "(a | b)*".
class RepeatedGroup < Group; end
- An array of identifiers and string literals.
class Alternative < Array; end
- --
- Needs @filename and @in.
- ++
def parse
# TODO: this only dumps the tokens. while t = next_token error(t, "#{t.cat}") end
end
- Set @filename and @in. Parse input.
case ARGV.length when 0 then @filename = "-" when 1 then @filename = ARGV[0] else fail "Too many arguments" end open(@filename) do |f|
@in = f @scanner = StringScanner.new("") parse
end </lang>
Tcl
Demonstration lexer and parser. Note that this parser supports parenthesized expressions, making the grammar recursive. <lang tcl>package require Tcl 8.6
- Utilities to make the coroutine easier to use
proc provide args {while {![yield $args]} {yield}} proc next lexer {$lexer 1} proc pushback lexer {$lexer 0}
- Lexical analyzer coroutine core
proc lexer {str} {
yield [info coroutine] set symbols {+ PLUS - MINUS * MULT / DIV ( LPAR ) RPAR} set idx 0 while 1 {
switch -regexp -matchvar m -- $str { {^\s+} { # No special action for whitespace } {^([-+*/()])} { provide [dict get $symbols [lindex $m 1]] [lindex $m 1] $idx } {^(\d+)} { provide NUMBER [lindex $m 1] $idx } {^$} { provide EOT "EOT" $idx return } . { provide PARSE_ERROR [lindex $m 0] $idx } } # Trim the matched string set str [string range $str [string length [lindex $m 0]] end] incr idx [string length [lindex $m 0]]
}
}
- Utility functions to help with making an LL(1) parser; ParseLoop handles
- EBNF looping constructs, ParseSeq handles sequence constructs.
proc ParseLoop {lexer def} {
upvar 1 token token payload payload index index foreach {a b} $def {
if {$b ne "-"} {set b [list set c $b]} lappend m $a $b
} lappend m default {pushback $lexer; break} while 1 {
lassign [next $lexer] token payload index switch -- $token {*}$m if {[set c [catch {uplevel 1 $c} res opt]]} { dict set opt -level [expr {[dict get $opt -level]+1}] return -options $opt $res }
}
} proc ParseSeq {lexer def} {
upvar 1 token token payload payload index index foreach {t s} $def {
lassign [next $lexer] token payload index switch -- $token $t { if {[set c [catch {uplevel 1 $s} res opt]]} { dict set opt -level [expr {[dict get $opt -level]+1}] return -options $opt $res } } EOT { throw SYNTAX "end of text at position $index" } default { throw SYNTAX "\"$payload\" at position $index" }
}
}
- Main parser driver; contains "master" grammar that ensures that the whole
- text is matched and not just a prefix substring. Note also that the parser
- runs the lexer as a coroutine (with a fixed name in this basic demonstration
- code).
proc parse {str} {
set lexer [coroutine l lexer $str] try {
set parsed [parse.expr $lexer] ParseLoop $lexer { EOT { return $parsed } } throw SYNTAX "\"$payload\" at position $index"
} trap SYNTAX msg {
return -code error "syntax error: $msg"
} finally {
catch {rename $lexer ""}
}
}
- Now the descriptions of how to match each production in the grammar...
proc parse.expr {lexer} {
set expr [parse.term $lexer] ParseLoop $lexer {
PLUS - MINUS { set expr [list $token $expr [parse.term $lexer]] }
} return $expr
} proc parse.term {lexer} {
set term [parse.factor $lexer] ParseLoop $lexer {
MULT - DIV { set term [list $token $term [parse.factor $lexer]] }
} return $term
} proc parse.factor {lexer} {
ParseLoop $lexer {
NUMBER { return $payload } MINUS { ParseSeq $lexer { NUMBER {return -$payload} } } LPAR { set result [parse.expr $lexer] ParseSeq $lexer { RPAR {return $result} } break } EOT { throw SYNTAX "end of text at position $index" }
} throw SYNTAX "\"$payload\" at position $index"
}</lang>
<lang tcl># Demonstration code puts [parse "1 - 2 - -3 * 4 + 5"] puts [parse "1 - 2 - -3 * (4 + 5)"]</lang> Output:
PLUS {MINUS {MINUS 1 2} {MULT -3 4}} 5 MINUS {MINUS 1 2} {MULT -3 {PLUS 4 5}}