Parse EBNF: Difference between revisions
(Want to {{improve}} both examples.) |
(→{{header|Tcl}}: Add in-progress, incomplete Ruby.) |
||
Line 52: | Line 52: | ||
: (parse "(1+2+3)*-4/(1+2)") |
: (parse "(1+2+3)*-4/(1+2)") |
||
-> "DIV {MULT {PLUS {PLUS 1 2} 3} -4} {PLUS 1 2}"</pre> |
-> "DIV {MULT {PLUS {PLUS 1 2} 3} -4} {PLUS 1 2}"</pre> |
||
=={{header|Ruby}}== |
|||
{{in progress|lang=Ruby|day=12|month=May|year=2011}} |
|||
{{incomplete|Ruby|This code divides the input into lexical tokens, but does not parse the tokens by grammar.}} |
|||
<lang ruby>require 'strscan' |
|||
Line = Struct.new :where, :str |
|||
Token = Struct.new :cat, :str, :line, :pos |
|||
# Reads and returns the next Token. At end of file, returns nil. |
|||
def next_token |
|||
# Loop until we reach a Token. |
|||
loop do |
|||
# If before first line, or if at end of line, |
|||
# then get next line, or else declare end of file. |
|||
if (not @scanner) or @scanner.eos? |
|||
if s = @in.gets |
|||
# Each line needs a new Line object. Tokens can hold references |
|||
# to old Line objects. |
|||
@line = Line.new("#{@filename}:#{@in.lineno}", s) |
|||
@scanner = StringScanner.new(s) |
|||
else |
|||
return nil # End of file |
|||
end |
|||
end |
|||
# Skip whitespace. |
|||
break unless @scanner.skip(/[[:space:]]+/) |
|||
end |
|||
# Find token by regexp. |
|||
# The :unknown regexp must not swallow any good tokens. |
|||
if s = @scanner.scan(/:/) |
|||
c = :colon |
|||
elsif s = @scanner.scan(/;/) |
|||
c = :semicolon |
|||
elsif s = @scanner.scan(/\(/) |
|||
c = :group |
|||
elsif s = @scanner.scan(/\)\?/) |
|||
c = :option |
|||
elsif s = @scanner.scan(/\)\*/) |
|||
c = :repeat |
|||
elsif s = @scanner.scan(/\)/) |
|||
c = :one |
|||
elsif s = @scanner.scan(/\|/) |
|||
c = :bar |
|||
elsif s = @scanner.scan(/'[^']*'|"[^"]*"/) |
|||
c = :string |
|||
elsif s = @scanner.scan(/[[:alpha:]][[:alnum:]]*/) |
|||
c = :ident |
|||
elsif s = @scanner.scan(/[^:;()'"[:alpha:][:space:]]*/) |
|||
c = :unknown |
|||
end |
|||
Token.new(c, s, @line, (@scanner.pos - s.length)) |
|||
end |
|||
def read_it |
|||
# TODO: this only dumps the tokens. |
|||
while t = next_token |
|||
p t |
|||
end |
|||
end |
|||
# Read files from ARGV, or else read STDIN. |
|||
case ARGV.length |
|||
when 0 then @filename = "-" |
|||
when 1 then @filename = ARGV[0] |
|||
else fail "Too many arguments" |
|||
end |
|||
open(@filename) { |f| @in = f; read_it }</lang> |
|||
=={{header|Tcl}}== |
=={{header|Tcl}}== |
Revision as of 00:51, 12 May 2011
Create a simple parser for EBNF grammars. Here is an ebnf grammar in itself and a parser for it in php.
- You can use regular expressions for lexing.
- Generate the calculator in Arithmetic evaluation using an EBNF description of the calculator.
Here are simple parser rules for a calculator taken from the antlr tutorial
expr : term ( ( PLUS | MINUS ) term )* ; term : factor ( ( MULT | DIV ) factor )* ; factor : NUMBER ;
PicoLisp
<lang PicoLisp>(de parse (Str)
(let *L (str Str "") (aggr) ) )
(de aggr ()
(let X (prod) (while (member (car *L) '("+" "-")) (setq X (op (if (= "+" (pop '*L)) "PLUS" "MINUS") X (prod))) ) X ) )
(de prod ()
(let X (term) (while (member (car *L) '("*" "/")) (setq X (op (if (= "*" (pop '*L)) "MULT" "DIV") X (term))) ) X ) )
(de term ()
(let X (pop '*L) (cond ((num? X) X) ((= "+" X) (term)) ((= "-" X) (pack "-" (term))) ((= "(" X) (prog1 (aggr) (pop '*L)))) ) ) )
(de op (Op Ex1 Ex2)
(pack Op " " (subx Ex1) " " (subx Ex2)) )
(de subx (Ex)
(if (sub? " " Ex) (pack "{" Ex "}") Ex) ) )</lang>
Output:
: (parse "1-2 - -3 * (4+5)") -> "MINUS {MINUS 1 2} {MULT -3 {PLUS 4 5}}" : (parse "1-2 - -3 * 4+5") -> "PLUS {MINUS {MINUS 1 2} {MULT -3 4}} 5" : (parse "(1+2+3)*-4/(1+2)") -> "DIV {MULT {PLUS {PLUS 1 2} 3} -4} {PLUS 1 2}"
Ruby
<lang ruby>require 'strscan'
Line = Struct.new :where, :str Token = Struct.new :cat, :str, :line, :pos
- Reads and returns the next Token. At end of file, returns nil.
def next_token
# Loop until we reach a Token. loop do # If before first line, or if at end of line, # then get next line, or else declare end of file. if (not @scanner) or @scanner.eos? if s = @in.gets # Each line needs a new Line object. Tokens can hold references # to old Line objects. @line = Line.new("#{@filename}:#{@in.lineno}", s) @scanner = StringScanner.new(s) else return nil # End of file end end
# Skip whitespace. break unless @scanner.skip(/space:+/) end
# Find token by regexp. # The :unknown regexp must not swallow any good tokens. if s = @scanner.scan(/:/) c = :colon elsif s = @scanner.scan(/;/) c = :semicolon elsif s = @scanner.scan(/\(/) c = :group elsif s = @scanner.scan(/\)\?/) c = :option elsif s = @scanner.scan(/\)\*/) c = :repeat elsif s = @scanner.scan(/\)/) c = :one elsif s = @scanner.scan(/\|/) c = :bar elsif s = @scanner.scan(/'[^']*'|"[^"]*"/) c = :string elsif s = @scanner.scan(/alpha:alnum:*/) c = :ident elsif s = @scanner.scan(/[^:;()'"[:alpha:][:space:]]*/) c = :unknown end
Token.new(c, s, @line, (@scanner.pos - s.length))
end
def read_it
# TODO: this only dumps the tokens. while t = next_token p t end
end
- Read files from ARGV, or else read STDIN.
case ARGV.length when 0 then @filename = "-" when 1 then @filename = ARGV[0] else fail "Too many arguments" end open(@filename) { |f| @in = f; read_it }</lang>
Tcl
Demonstration lexer and parser. Note that this parser supports parenthesized expressions, making the grammar recursive. <lang tcl>package require Tcl 8.6
- Utilities to make the coroutine easier to use
proc provide args {while {![yield $args]} {yield}} proc next lexer {$lexer 1} proc pushback lexer {$lexer 0}
- Lexical analyzer coroutine core
proc lexer {str} {
yield [info coroutine] set symbols {+ PLUS - MINUS * MULT / DIV ( LPAR ) RPAR} set idx 0 while 1 {
switch -regexp -matchvar m -- $str { {^\s+} { # No special action for whitespace } {^([-+*/()])} { provide [dict get $symbols [lindex $m 1]] [lindex $m 1] $idx } {^(\d+)} { provide NUMBER [lindex $m 1] $idx } {^$} { provide EOT "EOT" $idx return } . { provide PARSE_ERROR [lindex $m 0] $idx } } # Trim the matched string set str [string range $str [string length [lindex $m 0]] end] incr idx [string length [lindex $m 0]]
}
}
- Utility functions to help with making an LL(1) parser; ParseLoop handles
- EBNF looping constructs, ParseSeq handles sequence constructs.
proc ParseLoop {lexer def} {
upvar 1 token token payload payload index index foreach {a b} $def {
if {$b ne "-"} {set b [list set c $b]} lappend m $a $b
} lappend m default {pushback $lexer; break} while 1 {
lassign [next $lexer] token payload index switch -- $token {*}$m if {[set c [catch {uplevel 1 $c} res opt]]} { dict set opt -level [expr {[dict get $opt -level]+1}] return -options $opt $res }
}
} proc ParseSeq {lexer def} {
upvar 1 token token payload payload index index foreach {t s} $def {
lassign [next $lexer] token payload index switch -- $token $t { if {[set c [catch {uplevel 1 $s} res opt]]} { dict set opt -level [expr {[dict get $opt -level]+1}] return -options $opt $res } } EOT { throw SYNTAX "end of text at position $index" } default { throw SYNTAX "\"$payload\" at position $index" }
}
}
- Main parser driver; contains "master" grammar that ensures that the whole
- text is matched and not just a prefix substring. Note also that the parser
- runs the lexer as a coroutine (with a fixed name in this basic demonstration
- code).
proc parse {str} {
set lexer [coroutine l lexer $str] try {
set parsed [parse.expr $lexer] ParseLoop $lexer { EOT { return $parsed } } throw SYNTAX "\"$payload\" at position $index"
} trap SYNTAX msg {
return -code error "syntax error: $msg"
} finally {
catch {rename $lexer ""}
}
}
- Now the descriptions of how to match each production in the grammar...
proc parse.expr {lexer} {
set expr [parse.term $lexer] ParseLoop $lexer {
PLUS - MINUS { set expr [list $token $expr [parse.term $lexer]] }
} return $expr
} proc parse.term {lexer} {
set term [parse.factor $lexer] ParseLoop $lexer {
MULT - DIV { set term [list $token $term [parse.factor $lexer]] }
} return $term
} proc parse.factor {lexer} {
ParseLoop $lexer {
NUMBER { return $payload } MINUS { ParseSeq $lexer { NUMBER {return -$payload} } } LPAR { set result [parse.expr $lexer] ParseSeq $lexer { RPAR {return $result} } break } EOT { throw SYNTAX "end of text at position $index" }
} throw SYNTAX "\"$payload\" at position $index"
}</lang>
<lang tcl># Demonstration code puts [parse "1 - 2 - -3 * 4 + 5"] puts [parse "1 - 2 - -3 * (4 + 5)"]</lang> Output:
PLUS {MINUS {MINUS 1 2} {MULT -3 4}} 5 MINUS {MINUS 1 2} {MULT -3 {PLUS 4 5}}