Compiler/Verifying syntax: Difference between revisions
(→{{header|Go}}: Fixed various problems highlighted in the Talk Page and added some more examples to test them.) |
m (add a restriction in right hand syntax) |
||
Line 200:
end
s = strip(s)
# Julia allows expressions like 2 + + + 3, or like true = not false, but these are not allowed here
if occursin(r"(and|or|[\=\<\+\-\*\/])\s*(and|or|[\=\<\+\-\*\/])", s) ||
occursin(r"(^(and|^or|^[\=\<\+\-\*\/]))|((and|or|[\=\<\+\-\*\/])$)", s) ||
occursin(r"(\=|\<)\s*not", s)
return false
end
Line 228 ⟶ 229:
"not 7 < 2",
"4 * (32 - 16) + 9 = 73",
"235 76 + 1",
"true or false = not true",
"not true = false",
]
Line 241 ⟶ 244:
The compiler parses the statement { 4 * (32 - 16) + 9 = 73 } and outputs: true
The compiler parses the statement { 235 76 + 1 } and outputs: false
The compiler parses the statement { true or false = not true } and outputs: false
The compiler parses the statement { not true = false } and outputs: true
</pre>
=={{header|Perl}}==
Made fix for 'not' see Discussion.
|
Revision as of 22:30, 3 January 2020
Verifying Syntax
A Syntax Analyzer that verifies a token stream,
outputs a string "true" if the token stream matches the grammar requirement,
outputs a string "false" if the token stream does not match the grammar.
Task
The program reads input from a file of token stream,
reads it and outputs a string "true" if the token stream matches the grammar,
outputs a string "false" and error messages if the token stream does not match the grammar,
based on the grammar below. The grammar is written in Extended Backus-Naur Form (EBNF).
Grammar
stmt = expr ; expr = expr_level_2; expr_level_2 = expr_level_3 {"or" expr_level_3} ; expr_level_3 = expr_level_4 {"and" expr_level_4} ; expr_level_4 = ["not"] expr_level_5 [('=' | '<') expr_level_5] ; expr_level_5 = expr_level_6 {('+' | '-') expr_level_6} ; expr_level_6 = primary {('*' | '/') primary} ; primary = Identifier | Integer | '(' expr ')' | "true" | "false" ; Integer = Digit {Digit}; Identifier = Letter {Letter | Digit | '_'}; Digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" ; Letter = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z" | "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z" ;
Go
If "and", "or", "not" and "=" are replaced by the corresponding symbols: "&&", "||", "!" and "==", then the task grammar is a subset of Go's own grammar - operator precedence and identifier construction are the same.
After making the appropriate substitutions, we can therefore use the parser in Go's standard library to verify whether the statements are valid or not. As expressions cannot be statements in Go, we simply parse the latter as expressions.
However, before applying the parser, we first need to ensure that the expressions don't include any characters (including non-ASCII) or usages thereof which Go would otherwise permit. Note that it's not necessary to specifically check for '++' and '--' as these are statements in Go and can't appear in expressions anyway.
In particular, after substitutions, '= not' would be allowed by the Go parser so we need to exclude it. Curiously, the Go parser allows something like '2 < 3 < 4' even though it doesn't compile. We need therefore to exclude that also (see Talk page). <lang go>package main
import (
"fmt" "go/parser" "regexp" "strings"
)
var (
re1 = regexp.MustCompile(`[^_a-zA-Z0-9\+\-\*/=<\(\) ]`) re2 = regexp.MustCompile(`\b_\w*\b`) re3 = regexp.MustCompile(`=[ ]*not`) re4 = regexp.MustCompile(`(=|<)[ ]*[^=< ]+[ ]*(=|<)`)
)
var subs = [][2]string{
{"=", "=="}, {" not ", " ! "}, {"(not ", "(! "}, {" or ", " || "}, {" and ", " && "},
}
func possiblyValid(expr string) error {
matches := re1.FindStringSubmatch(expr) if matches != nil { return fmt.Errorf("invalid character %q found", []rune(matches[0])[0]) } if re2.MatchString(expr) { return fmt.Errorf("identifier cannot begin with an underscore") } if re3.MatchString(expr) { return fmt.Errorf("expected operand, found 'not'") } matches = re4.FindStringSubmatch(expr) if matches != nil { return fmt.Errorf("expected EOF, found %q", []rune(matches[2])[0]) } return nil
}
func modify(err error) string {
e := err.Error() for _, sub := range subs { e = strings.ReplaceAll(e, strings.TrimSpace(sub[1]), strings.TrimSpace(sub[0])) } return strings.Split(e, ":")[2][1:] // remove location info as may be inaccurate
}
func main() {
exprs := []string{ "(42 + 3", " not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true", " and 3 < 2", "not 7 < 2", "2 < 3 < 4", "4 * (32 - 16) + 9 = 73", "235 76 + 1", "a + b = not c and false", "a + b = (not c) and false", "ab_c / bd2 or < e_f7", "g not = h", "été = false", "i++", "j & k", "l or _m", }
for _, expr := range exprs { fmt.Printf("Statement to verify: %q\n", expr) if err := possiblyValid(expr); err != nil { fmt.Printf("\"false\" -> %s\n\n", err.Error()) continue } expr = fmt.Sprintf(" %s ", expr) // make sure there are spaces at both ends for _, sub := range subs { expr = strings.ReplaceAll(expr, sub[0], sub[1]) } _, err := parser.ParseExpr(expr) if err != nil { fmt.Println(`"false" ->`, modify(err)) } else { fmt.Println(`"true"`) } fmt.Println() }
}</lang>
- Output:
Statement to verify: "(42 + 3" "false" -> expected ')', found newline Statement to verify: " not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true" "true" Statement to verify: " and 3 < 2" "false" -> expected operand, found 'and' Statement to verify: "not 7 < 2" "true" Statement to verify: "2 < 3 < 4" "false" -> expected EOF, found '<' Statement to verify: "4 * (32 - 16) + 9 = 73" "true" Statement to verify: "235 76 + 1" "false" -> expected 'EOF', found 76 Statement to verify: "a + b = not c and false" "false" -> expected operand, found 'not' Statement to verify: "a + b = (not c) and false" "true" Statement to verify: "ab_c / bd2 or < e_f7" "false" -> expected operand, found '<' Statement to verify: "g not = h" "false" -> expected 'EOF', found 'not' Statement to verify: "été = false" "false" -> invalid character 'é' found Statement to verify: "i++" "false" -> expected 'EOF', found '++' Statement to verify: "j & k" "false" -> invalid character '&' found Statement to verify: "l or _m" "false" -> identifier cannot begin with an underscore
Julia
<lang julia>function substituteinnerparentheses(s, subs)
((i = findlast('(', s)) == nothing) && return (s, false) ((j = findfirst(')', s[i:end])) == nothing) && return (s, false) okparse(s[i+1:j+i-2]) || return (s, false) return s[1:i-1] * " " * subs * " " * s[j+i:end], true
end
function okparse(s)
while findfirst('(', s) != nothing s, okinparentheses = substituteinnerparentheses(s, "true") okinparentheses || return false end s = strip(s) # Julia allows expressions like 2 + + + 3, or like true = not false, but these are not allowed here if occursin(r"(and|or|[\=\<\+\-\*\/])\s*(and|or|[\=\<\+\-\*\/])", s) || occursin(r"(^(and|^or|^[\=\<\+\-\*\/]))|((and|or|[\=\<\+\-\*\/])$)", s) || occursin(r"(\=|\<)\s*not", s) return false end # Julia allows ., ,, ; and operators like % but these are not allowed here # permitted: -+*/ true false and or not, ascii identifiers, and integers for item in split(s, r"\s+") !occursin( r"^[a-zA-Z][a-zA-Z_0-9]*$|^\d+$|^true$|^false$|^or$|^and$|^not$|^\=$|^\<$|^\+$|^-$|^\*$|^\/$", item) && return false end # change and, or, and not to the corresponding Julia operators s = replace(replace(replace(s, "and" => "&&"), "or" => "||"), "not" => "!") try # Use Julia's parser, which will throw exception if it parses an error Meta.parse(s) catch return false end return true
end
teststatements = [ " not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true", " and 3 < 2", "not 7 < 2", "4 * (32 - 16) + 9 = 73", "235 76 + 1", "true or false = not true", "not true = false", ]
for s in teststatements
println("The compiler parses the statement { $s } and outputs: ", okparse(s))
end
</lang>
- Output:
The compiler parses the statement { not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true } and outputs: true The compiler parses the statement { and 3 < 2 } and outputs: false The compiler parses the statement { not 7 < 2 } and outputs: true The compiler parses the statement { 4 * (32 - 16) + 9 = 73 } and outputs: true The compiler parses the statement { 235 76 + 1 } and outputs: false The compiler parses the statement { true or false = not true } and outputs: false The compiler parses the statement { not true = false } and outputs: true
Perl
Made fix for 'not' see Discussion. <lang>#!/usr/bin/perl
use strict; use warnings;
sub error { die s/\G/Template:\U@/r } sub want { /\G\Q$_[0]/gc ? pop : error $_[1]; 1 }
sub expr
{ my $level = shift; /\G\h+/gc; /\G(true|false)\b/gc or /\G\(/gc && want ')', "expected a closing paren", expr(0) or /\G(?=(and|or)\b)/gc && error "expected a primary, '$1' is an operator" or $level <= 4 && /\Gnot\b/gc && expr(5) or /\G(?:\d+|[a-zA-Z]\w*)/gc or error "expected a primary"; /\G\h+/gc ? 1 : $level <= 2 && /\Gor\b/gc ? expr(2) : $level <= 3 && /\Gand\b/gc ? expr(4) : $level <= 4 && /\G[=<]/gc ? expr(4.5) : $level == 4.5 && /\G(?=[=<])/gc ? error "operator is non-associative" : $level <= 5 && /\G[+-]/gc ? expr(6) : $level <= 6 && /\G[*\/]/gc ? expr(7) : return 1 while 1; }
while( )
{ print eval { want "\n", "expected end of input", expr(0) } ? "${_}true\n" : "$@false\n", "\n"; }
__DATA__ (42 + 3 not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true and 3 < 2 not 7 < 2 2 < 3 < 4 4 * (32 - 16) + 9 = 73 235 76 + 1 a + b = not c and false a + b = (not c) and false ab_c / bd2 or < e_f7 g not = h été = false i++ j & k l or _m</lang>
- Output:
(42 + 3{{EXPECTED A CLOSING PAREN}} false not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true true {{EXPECTED A PRIMARY, 'AND' IS AN OPERATOR}}and 3 < 2 false not 7 < 2 true 2 < 3 {{OPERATOR IS NON-ASSOCIATIVE}}< 4 false 4 * (32 - 16) + 9 = 73 true 235 {{EXPECTED END OF INPUT}}76 + 1 false a + b = not {{EXPECTED END OF INPUT}}c and false false a + b = (not c) and false true ab_c / bd2 or {{EXPECTED A PRIMARY}}< e_f7 false g {{EXPECTED END OF INPUT}}not = h false {{EXPECTED A PRIMARY}}été = false false i+{{EXPECTED A PRIMARY}}+ false j {{EXPECTED END OF INPUT}}& k false l or {{EXPECTED A PRIMARY}}_m false
Phix
<lang Phix>-- demo\rosetta\Compiler\Verify_Syntax.exw string src integer ch, sdx
procedure skip_spaces()
while 1 do if sdx>length(src) then exit end if ch = src[sdx] if not find(ch," \t\r\n") then exit end if sdx += 1 end while
end procedure
enum SYMBOL, STRING, INTEGER, IDENT, ERROR, EOF constant toktypes = {"SYMBOL","STRING","INTEGER","IDENT","ERROR","EOF"} sequence tok
function sprintok(string fmt)
tok[1] = toktypes[tok[1]] return sprintf(fmt,{tok})
end function
procedure next_token() -- yeilds a single character token, one of: -- {SYMBOL,ch} where ch is one of "{}()[]|=.;", or -- {STRING,string}, or -- {INTEGER,n}, or -- {IDENT,string}, or -- {ERROR,msg}, or -- {EOF}
skip_spaces() integer tokstart = sdx if tok[1]=ERROR then ?{"erm, tok is",tok} -- looping?? elsif sdx>length(src) then tok = {EOF} elsif find(ch,"()+-/*=&<") then sdx += 1 tok = {SYMBOL,ch&""} elsif ch='\"' or ch='\ then integer closech = ch tok = {ERROR,"no closing quote"} sdx += 1 for tokend=sdx to length(src) do if src[tokend]=closech then sdx = tokend+1 tok = {STRING,src[tokstart+1..tokend-1]} exit end if end for elsif (ch>='0' and ch<='9') then integer n = ch-'0' while true do sdx += 1 if sdx>length(src) then exit end if ch = src[sdx] if ch<'0' or ch>'9' then exit end if n = n*10 + ch-'0' end while tok = {INTEGER,n} elsif (ch>='a' and ch<='z') or (ch>='A' and ch<='Z') then while true do sdx += 1 if sdx>length(src) then exit end if ch = src[sdx] if ch!='_' and (ch<'a' or ch>'z') and (ch<'A' or ch>'Z') and (ch<'0' or ch>'9') then exit end if end while tok = {IDENT,src[tokstart..sdx-1]} elsif ch='_' then tok = {ERROR,"identifiers may not start with _"} sdx += 1 else tok = {ERROR,sprintf("illegal char (%c/%d)",ch)} sdx += 1 end if
end procedure
forward procedure or_expr()
procedure primary()
integer tt = tok[1] if tt=IDENT or tt=INTEGER or (tt=STRING and find(tok[2],{"true","false"})) then next_token() elsif tok={SYMBOL,"("} then next_token() or_expr() if tok!={SYMBOL,")"} then tok = {ERROR,") expected"} else next_token() end if else tok = {ERROR,sprintok("invalid [%v]")} end if
end procedure
procedure mul_expr()
while true do primary() if not find(tok,{{SYMBOL,"*"},{SYMBOL,"/"}}) then exit end if next_token() end while
end procedure
procedure sum_expr()
while true do mul_expr() if not find(tok,{{SYMBOL,"+"},{SYMBOL,"-"}}) then exit end if next_token() end while
end procedure
procedure cmp_expr()
if tok=={IDENT,"not"} then next_token() end if while true do sum_expr() if not find(tok,{{SYMBOL,"="},{SYMBOL,"<"}}) then exit end if next_token() end while
end procedure
procedure and_expr()
while true do cmp_expr() if tok!={IDENT,"and"} then exit end if next_token() end while
end procedure
procedure or_expr()
while true do and_expr() if tok!={IDENT,"or"} then exit end if next_token() end while
end procedure
procedure statement()
or_expr()
end procedure
procedure verify_syntax(string source)
src = source sdx = 1 tok = {0} -- ("not error"/invalid-ish) next_token() statement() printf(1,"%30s ==> %s\n",{source,iff(tok[1]=EOF?"pass":sprintok("fail [tok=%v]"))})
end procedure
constant tests = {
"(42 + 3", " not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true", " and 3 < 2", "not 7 < 2", "4 * (32 - 16) + 9 = 73", "235 76 + 1", "a + b = not c and false", "a + b = (not c) and false", "ab_c / bd2 or < e_f7", "g not = h", "été = false", "i++", "j & k", "l or _m"}
for i=1 to length(tests) do
verify_syntax(tests[i])
end for</lang>
- Output:
Note that "= not c" fails, whereas "= (not c)" passes, see talk page.
(42 + 3 ==> fail [tok={"ERROR",") expected"}] not 3 < 4 or (true or 3 / 4 + 8 * 5 - 5 * 2 < 56) and 4 * 3 < 12 or not true ==> pass and 3 < 2 ==> fail [tok={"INTEGER",3}] not 7 < 2 ==> pass 4 * (32 - 16) + 9 = 73 ==> pass 235 76 + 1 ==> fail [tok={"INTEGER",76}] a + b = not c and false ==> fail [tok={"IDENT","c"}] a + b = (not c) and false ==> pass ab_c / bd2 or < e_f7 ==> fail [tok={"ERROR",`invalid [{"SYMBOL","<"}]`}] g not = h ==> fail [tok={"IDENT","not"}] â??®tâ??® = false ==> fail [tok={"ERROR",`invalid [{"ERROR","illegal char (â??/195)"}]`}] i++ ==> fail [tok={"ERROR",`invalid [{"SYMBOL","+"}]`}] j & k ==> fail [tok={"SYMBOL","&"}] l or _m ==> fail [tok={"ERROR",`invalid [{"ERROR","identifiers may not start with _"}]`}]