User:Ed Davis: Difference between revisions
No edit summary |
No edit summary |
||
Line 163: | Line 163: | ||
<lang c> |
<lang c> |
||
/* |
/* |
||
All lexical tokens - not |
All lexical tokens - not syntactically correct, but that will |
||
have to wait until syntax analysis |
have to wait until syntax analysis |
||
*/ |
*/ |
||
Line 232: | Line 232: | ||
;Implementations |
;Implementations |
||
__TOC__ |
|||
=={{header|C}}== |
=={{header|C}}== |
||
Line 288: | Line 287: | ||
} |
} |
||
static |
static int next_ch() { /* get next char from input */ |
||
the_ch = getc(source_fp); |
the_ch = getc(source_fp); |
||
++col; |
++col; |
||
Line 295: | Line 294: | ||
col = 0; |
col = 0; |
||
} |
} |
||
return the_ch; |
|||
} |
} |
||
Line 301: | Line 301: | ||
error(err_line, err_col, "gettok: empty character constant"); |
error(err_line, err_col, "gettok: empty character constant"); |
||
if (the_ch == '\\') { |
if (the_ch == '\\') { |
||
next_ch(); |
|||
if (the_ch == 'n') |
if (the_ch == 'n') |
||
n = 10; |
n = 10; |
||
Line 308: | Line 308: | ||
else error(err_line, err_col, "gettok: unknown escape sequence \\%c", the_ch); |
else error(err_line, err_col, "gettok: unknown escape sequence \\%c", the_ch); |
||
} |
} |
||
if (next_ch() != '\'') |
|||
read_ch(); |
|||
error(err_line, err_col, "multi-character constant"); |
|||
next_ch(); |
|||
return (tok_s){Integerk, err_line, err_col, {n}}; |
return (tok_s){Integerk, err_line, err_col, {n}}; |
||
} |
} |
||
Line 320: | Line 320: | ||
/* comment found */ |
/* comment found */ |
||
for (;;) { |
for (;;) { |
||
if (next_ch() == '*' && next_ch() == '/') { |
|||
read_ch(); |
|||
next_ch(); |
|||
return gettok(); |
|||
} else if (the_ch == EOF) |
|||
error(err_line, err_col, "EOF in comment"); |
|||
return gettok(); |
|||
} |
|||
} |
|||
} |
} |
||
} |
} |
||
Line 334: | Line 331: | ||
da_rewind(text); |
da_rewind(text); |
||
while (next_ch() != start) { |
|||
if (the_ch == '\n') |
if (the_ch == '\n') error(err_line, err_col, "EOL in string"); |
||
error(err_line, err_col, " |
if (the_ch == EOF) error(err_line, err_col, "EOF in string"); |
||
if (the_ch == EOF) |
|||
error(err_line, err_col, "EOF in string"); |
|||
da_append(text, (char)the_ch); |
da_append(text, (char)the_ch); |
||
} |
} |
||
da_append(text, '\0'); |
da_append(text, '\0'); |
||
next_ch(); |
|||
return (tok_s){Stringk, err_line, err_col, {.text=text}}; |
return (tok_s){Stringk, err_line, err_col, {.text=text}}; |
||
} |
} |
||
Line 373: | Line 368: | ||
if (!isdigit(the_ch)) |
if (!isdigit(the_ch)) |
||
is_number = false; |
is_number = false; |
||
next_ch(); |
|||
} |
} |
||
if (da_len(text) == 0) |
if (da_len(text) == 0) |
||
Line 391: | Line 386: | ||
static tok_s follow(int expect, TokenType ifyes, TokenType ifno, int err_line, int err_col) { /* look ahead for '>=', etc. */ |
static tok_s follow(int expect, TokenType ifyes, TokenType ifno, int err_line, int err_col) { /* look ahead for '>=', etc. */ |
||
if (the_ch == expect) { |
if (the_ch == expect) { |
||
next_ch(); |
|||
return (tok_s){ifyes, err_line, err_col, {0}}; |
return (tok_s){ifyes, err_line, err_col, {0}}; |
||
} |
} |
||
if (ifno == EOI) |
|||
if (ifno == EOI) error(err_line, err_col, "follow: unrecognized character '%c' (%d)\n", the_ch, the_ch); |
|||
error(err_line, err_col, "follow: unrecognized character '%c' (%d)\n", the_ch, the_ch); |
|||
return (tok_s){ifno, err_line, err_col, {0}}; |
return (tok_s){ifno, err_line, err_col, {0}}; |
||
} |
} |
||
Line 401: | Line 397: | ||
/* skip white space */ |
/* skip white space */ |
||
while (isspace(the_ch)) |
while (isspace(the_ch)) |
||
next_ch(); |
|||
int err_line = line; |
int err_line = line; |
||
int err_col = col; |
int err_col = col; |
||
switch (the_ch) { |
switch (the_ch) { |
||
case '{': |
case '{': next_ch(); return (tok_s){Lbrace, err_line, err_col, {0}}; |
||
case '}': |
case '}': next_ch(); return (tok_s){Rbrace, err_line, err_col, {0}}; |
||
case '(': |
case '(': next_ch(); return (tok_s){Lparen, err_line, err_col, {0}}; |
||
case ')': |
case ')': next_ch(); return (tok_s){Rparen, err_line, err_col, {0}}; |
||
case '+': |
case '+': next_ch(); return (tok_s){Add, err_line, err_col, {0}}; |
||
case '-': |
case '-': next_ch(); return (tok_s){Sub, err_line, err_col, {0}}; |
||
case '*': |
case '*': next_ch(); return (tok_s){Mul, err_line, err_col, {0}}; |
||
case ';': |
case ';': next_ch(); return (tok_s){Semi, err_line, err_col, {0}}; |
||
case ',': |
case ',': next_ch(); return (tok_s){Comma, err_line, err_col, {0}}; |
||
case '>': |
case '>': next_ch(); return (tok_s){Gtr, err_line, err_col, {0}}; |
||
case '=': |
case '=': next_ch(); return (tok_s){Assign, err_line, err_col, {0}}; |
||
case '/': |
case '/': next_ch(); return div_or_cmt(err_line, err_col); |
||
case '\'': |
case '\'': next_ch(); return char_lit(the_ch, err_line, err_col); |
||
case '<': |
case '<': next_ch(); return follow('=', Leq, Lss, err_line, err_col); |
||
case '!': |
case '!': next_ch(); return follow('=', Neq, EOI, err_line, err_col); |
||
case '&': |
case '&': next_ch(); return follow('&', And, EOI, err_line, err_col); |
||
case '"' : return string_lit(the_ch, err_line, err_col); |
case '"' : return string_lit(the_ch, err_line, err_col); |
||
default: return ident_or_int(err_line, err_col); |
default: return ident_or_int(err_line, err_col); |
||
Line 436: | Line 432: | ||
"Uminus Mul Div Add Sub Lss Gtr Leq Neq " |
"Uminus Mul Div Add Sub Lss Gtr Leq Neq " |
||
"And Semi Comma Assign Integer String Ident "[tok.tok * 9]); |
"And Semi Comma Assign Integer String Ident "[tok.tok * 9]); |
||
if (tok.tok == Integerk) |
|||
fprintf(dest_fp, " % |
if (tok.tok == Integerk) fprintf(dest_fp, " %4d", tok.n); |
||
else if (tok.tok == Ident) |
else if (tok.tok == Ident) fprintf(dest_fp, " %s", tok.text); |
||
fprintf(dest_fp, " %s", tok.text); |
else if (tok.tok == Stringk) fprintf(dest_fp, " \"%s\"", tok.text); |
||
else if (tok.tok == Stringk) |
|||
fprintf(dest_fp, " \"%s\"", tok.text); |
|||
fprintf(dest_fp, "\n"); |
fprintf(dest_fp, "\n"); |
||
} while (tok.tok != EOI); |
} while (tok.tok != EOI); |
||
Line 460: | Line 454: | ||
run(); |
run(); |
||
} |
} |
||
</lang> |
|||
=={{header|Euphoria}}== |
|||
<lang euphoria> |
|||
include std/io.e |
|||
include std/map.e |
|||
include std/types.e |
|||
include std/convert.e |
|||
constant true = 1, false = 0, EOF = -1 |
|||
enum EOI, Printk, Putc, Ifk, Whilek, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, |
|||
Add, Sub, Lss, Gtr, Leq, Neq, Andk, Semi, Comma, Assign, Integerk, Stringk, Ident |
|||
constant all_syms = { "EOI", "Print", "Putc", "If", "While", "Lbrace", "Rbrace", "Lparen", |
|||
"Rparen", "Uminus", "Mul", "Div", "Add", "Sub", "Lss", "Gtr", "Leq", "Neq", "And", |
|||
"Semi", "Comma", "Assign", "Integer", "String", "Ident"} |
|||
integer input_file, the_ch = ' ', the_col = 0, the_line = 1 |
|||
sequence symbols |
|||
map key_words = new() |
|||
procedure error(sequence format, sequence data) |
|||
printf(STDOUT, format, data) |
|||
abort(1) |
|||
end procedure |
|||
-- get the next character from the input |
|||
function next_ch() |
|||
the_ch = getc(input_file) |
|||
the_col += 1 |
|||
if the_ch = '\n' then |
|||
the_line += 1 |
|||
the_col = 0 |
|||
end if |
|||
return the_ch |
|||
end function |
|||
-- 'x' - character constants |
|||
function char_lit(integer err_line, integer err_col) |
|||
integer n = next_ch() -- skip opening quote |
|||
if the_ch = '\'' then |
|||
error("%d %d empty character constant", {err_line, err_col}) |
|||
elsif the_ch = '\\' then |
|||
next_ch() |
|||
if the_ch = 'n' then |
|||
n = 10 |
|||
elsif the_ch = '\\' then |
|||
n = '\\' |
|||
else |
|||
error("%d %d unknown escape sequence \\%c", {err_line, err_col, the_ch}) |
|||
end if |
|||
end if |
|||
if next_ch() != '\'' then |
|||
error("%d %d multi-character constant", {err_line, err_col}) |
|||
end if |
|||
next_ch() |
|||
return {Integerk, err_line, err_col, n} |
|||
end function |
|||
-- process divide or comments |
|||
function div_or_cmt(integer err_line, integer err_col) |
|||
if next_ch() != '*' then |
|||
return {Div, err_line, err_col} |
|||
end if |
|||
-- comment found |
|||
while true do |
|||
if next_ch() = '*' and next_ch() = '/' then |
|||
next_ch() |
|||
return get_tok() |
|||
elsif the_ch = EOF then |
|||
error("%d %d EOF in comment", {err_line, err_col}) |
|||
end if |
|||
end while |
|||
end function |
|||
-- "string" |
|||
function string_lit(integer start, integer err_line, integer err_col) |
|||
string text = "" |
|||
while next_ch() != start do |
|||
if the_ch = EOF then |
|||
error("%d %d EOF while scanning string literal", {err_line, err_col}) |
|||
end if |
|||
if the_ch = '\n' then |
|||
error("%d %d EOL while scanning string literal", {err_line, err_col}) |
|||
end if |
|||
text &= the_ch |
|||
end while |
|||
next_ch() |
|||
return {Stringk, err_line, err_col, text} |
|||
end function |
|||
-- handle identifiers and integers |
|||
function ident_or_int(integer err_line, integer err_col) |
|||
integer n, is_number = true |
|||
string text = "" |
|||
while t_alnum(the_ch) or the_ch = '_' do |
|||
text &= the_ch |
|||
if not t_digit(the_ch) then |
|||
is_number = false |
|||
end if |
|||
next_ch() |
|||
end while |
|||
if length(text) = 0 then |
|||
error("%d %d ident_or_int: unrecognized character: (%d) '%s'", {err_line, err_col, the_ch, the_ch}) |
|||
end if |
|||
if t_digit(text[1]) then |
|||
if not is_number then |
|||
error("%d %d invalid number: %s", {err_line, err_col, text}) |
|||
end if |
|||
n = to_integer(text) |
|||
return {Integerk, err_line, err_col, n} |
|||
end if |
|||
if has(key_words, text) then |
|||
return {get(key_words, text), err_line, err_col} |
|||
end if |
|||
return {Ident, err_line, err_col, text} |
|||
end function |
|||
-- look ahead for '>=', etc. |
|||
function follow(integer expect, integer ifyes, integer ifno, integer err_line, integer err_col) |
|||
if next_ch() = expect then |
|||
next_ch() |
|||
return {ifyes, err_line, err_col} |
|||
end if |
|||
if ifno = EOI then |
|||
error("%d %d follow: unrecognized character: (%d)", {err_line, err_col, the_ch}) |
|||
end if |
|||
return {ifno, err_line, err_col} |
|||
end function |
|||
-- return the next token type |
|||
function get_tok() |
|||
while t_space(the_ch) do |
|||
next_ch() |
|||
end while |
|||
integer err_line = the_line |
|||
integer err_col = the_col |
|||
switch the_ch do |
|||
case EOF then return {EOI, err_line, err_col} |
|||
case '/' then return div_or_cmt(err_line, err_col) |
|||
case '\'' then return char_lit(err_line, err_col) |
|||
case '<' then return follow('=', Leq, Lss, err_line, err_col) |
|||
case '!' then return follow('=', Neq, EOI, err_line, err_col) |
|||
case '&' then return follow('&', Andk, EOI, err_line, err_col) |
|||
case '"' then return string_lit(the_ch, err_line, err_col) |
|||
case else |
|||
integer sym = symbols[the_ch] |
|||
if sym != EOI then |
|||
next_ch() |
|||
return {sym, err_line, err_col} |
|||
end if |
|||
return ident_or_int(err_line, err_col) |
|||
end switch |
|||
end function |
|||
procedure init() |
|||
put(key_words, "if", Ifk) |
|||
put(key_words, "print", Printk) |
|||
put(key_words, "putc", Putc) |
|||
put(key_words, "while", Whilek) |
|||
symbols = repeat(EOI, 256) |
|||
symbols['{'] = Lbrace |
|||
symbols['}'] = Rbrace |
|||
symbols['('] = Lparen |
|||
symbols[')'] = Rparen |
|||
symbols['+'] = Add |
|||
symbols['-'] = Sub |
|||
symbols['*'] = Mul |
|||
symbols[';'] = Semi |
|||
symbols[','] = Comma |
|||
symbols['>'] = Gtr |
|||
symbols['='] = Assign |
|||
end procedure |
|||
procedure main(sequence cl) |
|||
sequence file_name |
|||
input_file = STDIN |
|||
if length(cl) > 2 then |
|||
file_name = cl[3] |
|||
input_file = open(file_name, "r") |
|||
if input_file = -1 then |
|||
error("Could not open %s", {file_name}) |
|||
end if |
|||
end if |
|||
init() |
|||
sequence t |
|||
loop do |
|||
t = get_tok() |
|||
printf(STDOUT, "line %5d col %5d %-8s", {t[2], t[3], all_syms[t[1]]}) |
|||
switch t[1] do |
|||
case Integerk then printf(STDOUT, " %5d\n", {t[4]}) |
|||
case Ident then printf(STDOUT, " %s\n", {t[4]}) |
|||
case Stringk then printf(STDOUT, " \"%s\"\n", {t[4]}) |
|||
case else printf(STDOUT, "\n") |
|||
end switch |
|||
until t[1] = EOI |
|||
end loop |
|||
end procedure |
|||
main(command_line()) |
|||
</lang> |
</lang> |
||
Line 689: | Line 898: | ||
dim tok_list(tk_eoi to tk_ident) as string |
dim tok_list(tk_eoi to tk_ident) as string |
||
tok_list(tk_eoi |
tok_list(tk_eoi ) = "EOI" |
||
tok_list(tk_print |
tok_list(tk_print ) = "Print" |
||
tok_list(tk_putc |
tok_list(tk_putc ) = "Putc" |
||
tok_list(tk_if |
tok_list(tk_if ) = "If" |
||
tok_list(tk_while |
tok_list(tk_while ) = "While" |
||
tok_list(tk_lbrace |
tok_list(tk_lbrace ) = "Lbrace" |
||
tok_list(tk_rbrace |
tok_list(tk_rbrace ) = "Rbrace" |
||
tok_list(tk_lparen |
tok_list(tk_lparen ) = "Lparen" |
||
tok_list(tk_rparen |
tok_list(tk_rparen ) = "Rparen" |
||
tok_list(tk_uminus |
tok_list(tk_uminus ) = "Uminus" |
||
tok_list(tk_mul |
tok_list(tk_mul ) = "Mul" |
||
tok_list(tk_div |
tok_list(tk_div ) = "Div" |
||
tok_list(tk_add |
tok_list(tk_add ) = "Add" |
||
tok_list(tk_sub |
tok_list(tk_sub ) = "Sub" |
||
tok_list(tk_lss |
tok_list(tk_lss ) = "Lss" |
||
tok_list(tk_gtr |
tok_list(tk_gtr ) = "Gtr" |
||
tok_list(tk_leq |
tok_list(tk_leq ) = "Leq" |
||
tok_list(tk_neq |
tok_list(tk_neq ) = "Neq" |
||
tok_list(tk_and |
tok_list(tk_and ) = "And" |
||
tok_list(tk_semi |
tok_list(tk_semi ) = "Semi" |
||
tok_list(tk_comma |
tok_list(tk_comma ) = "Comma" |
||
tok_list(tk_assign |
tok_list(tk_assign ) = "Assign" |
||
tok_list(tk_integer |
tok_list(tk_integer) = "Integer" |
||
tok_list(tk_string |
tok_list(tk_string ) = "String" |
||
tok_list(tk_ident |
tok_list(tk_ident ) = "Ident" |
||
do |
do |
||
Line 735: | Line 944: | ||
=={{header|Python}}== |
=={{header|Python}}== |
||
<lang Python> |
<lang Python> |
||
from __future__ import print_function |
|||
import sys |
import sys |
||
Line 762: | Line 972: | ||
#*** get the next character from the input |
#*** get the next character from the input |
||
def |
def next_ch(): |
||
global the_ch, the_col, the_line |
global the_ch, the_col, the_line |
||
Line 774: | Line 984: | ||
#*** 'x' - character constants |
#*** 'x' - character constants |
||
def char_lit(err_line, err_col): |
def char_lit(err_line, err_col): |
||
n = ord( |
n = ord(next_ch()) # skip opening quote |
||
if the_ch == '\'': |
if the_ch == '\'': |
||
error(err_line, err_col, "empty character constant") |
error(err_line, err_col, "empty character constant") |
||
elif the_ch == '\\': |
elif the_ch == '\\': |
||
next_ch() |
|||
if the_ch == 'n': |
if the_ch == 'n': |
||
n = 10 |
n = 10 |
||
Line 785: | Line 995: | ||
else: |
else: |
||
error(err_line, err_col, "unknown escape sequence \\%c" % (the_ch)) |
error(err_line, err_col, "unknown escape sequence \\%c" % (the_ch)) |
||
if |
if next_ch() != '\'': |
||
error(err_line, err_col, "multi-character constant") |
error(err_line, err_col, "multi-character constant") |
||
next_ch() |
|||
return Integerk, err_line, err_col, n |
return Integerk, err_line, err_col, n |
||
#*** process divide or comments |
#*** process divide or comments |
||
def div_or_cmt(err_line, err_col): |
def div_or_cmt(err_line, err_col): |
||
if |
if next_ch() != '*': |
||
return Div, err_line, err_col |
return Div, err_line, err_col |
||
# comment found |
# comment found |
||
while True: |
while True: |
||
if |
if next_ch() == '*' and next_ch() == '/': |
||
next_ch() |
|||
return gettok() |
return gettok() |
||
elif len(the_ch) == 0: |
elif len(the_ch) == 0: |
||
Line 807: | Line 1,017: | ||
text = "" |
text = "" |
||
while |
while next_ch() != start: |
||
if len(the_ch) == 0: |
if len(the_ch) == 0: |
||
error(err_line, err_col, "EOF while scanning string literal") |
error(err_line, err_col, "EOF while scanning string literal") |
||
Line 814: | Line 1,024: | ||
text += the_ch |
text += the_ch |
||
next_ch() |
|||
return Stringk, err_line, err_col, text |
return Stringk, err_line, err_col, text |
||
Line 826: | Line 1,036: | ||
if not the_ch.isdigit(): |
if not the_ch.isdigit(): |
||
is_number = False |
is_number = False |
||
next_ch() |
|||
if len(text) == 0: |
if len(text) == 0: |
||
Line 844: | Line 1,054: | ||
#*** look ahead for '>=', etc. |
#*** look ahead for '>=', etc. |
||
def follow(expect, ifyes, ifno, err_line, err_col): |
def follow(expect, ifyes, ifno, err_line, err_col): |
||
if |
if next_ch() == expect: |
||
next_ch() |
|||
return ifyes, err_line, err_col |
return ifyes, err_line, err_col |
||
Line 856: | Line 1,066: | ||
def gettok(): |
def gettok(): |
||
while the_ch.isspace(): |
while the_ch.isspace(): |
||
next_ch() |
|||
err_line = the_line |
err_line = the_line |
||
Line 862: | Line 1,072: | ||
if len(the_ch) == 0: return EOI, err_line, err_col |
if len(the_ch) == 0: return EOI, err_line, err_col |
||
elif the_ch in symbols: sym = symbols[the_ch]; getc(); return sym, err_line, err_col |
|||
elif the_ch == '/': return div_or_cmt(err_line, err_col) |
elif the_ch == '/': return div_or_cmt(err_line, err_col) |
||
elif the_ch == '\'': return char_lit(err_line, err_col) |
elif the_ch == '\'': return char_lit(err_line, err_col) |
||
Line 869: | Line 1,078: | ||
elif the_ch == '&': return follow('&', And, EOI, err_line, err_col) |
elif the_ch == '&': return follow('&', And, EOI, err_line, err_col) |
||
elif the_ch == '"': return string_lit(the_ch, err_line, err_col) |
elif the_ch == '"': return string_lit(the_ch, err_line, err_col) |
||
elif the_ch in symbols: |
|||
else: return ident_or_int(err_line, err_col) |
|||
sym = symbols[the_ch] |
|||
next_ch() |
|||
return sym, err_line, err_col |
|||
else: return ident_or_int(err_line, err_col) |
|||
#*** main driver |
#*** main driver |
||
Line 885: | Line 1,098: | ||
col = t[2] |
col = t[2] |
||
print("line %5d col %5d %-8s" % (line, col, all_syms[tok]), end='') |
|||
if tok == Integerk: |
|||
print("line %5d col %5d %-8s %8d" % (line, col, all_syms[tok], t[3])) |
|||
if tok == Integerk: print(" %5d" % (t[3])) |
|||
print(" |
elif tok == Ident: print(" %s" % (t[3])) |
||
elif tok == Stringk: |
elif tok == Stringk: print(' "%s"' % (t[3])) |
||
else: print("") |
|||
else: |
|||
print("line %5d col %5d %-8s" % (line, col, all_syms[tok])) |
|||
if tok == EOI: |
if tok == EOI: |
Revision as of 15:15, 13 August 2016
Lexical analysis is the process of converting a sequence of characters (such as in a computer program or web page) into a sequence of tokens (strings with an identified "meaning"). A program that performs lexical analysis may be called a lexer, tokenizer, or scanner (though "scanner" is also used to refer to the first stage of a lexer).
- The Task
Create a lexical analyzer for the Tiny programming language. The program should read input from a file and/or stdin, and write output to a file and/or stdout.
- Specification
The various token types are denoted below.
- Operators
Characters | Common name | Name |
---|---|---|
* | multiply | Mul |
/ | divide | Div |
+ | plus | Add |
- | minus and unary minus | Sub and Uminus |
< | less than | Lss |
<= | less than or equal | Leq |
> | greater than | Gtr |
!= | not equal | Neq |
= | assign | Assign |
&& | and | And |
- Symbols
Characters | Common name | Name |
---|---|---|
( | left parenthesis | Lparen |
) | right parenthesis | Rparen |
{ | left brace | Lbrace |
} | right brace | Rbrace |
; | semi colon | Semi |
, | comma | Comma |
- Keywords
Characters | Name |
---|---|
if | If |
while | While |
putc | Putc |
print | Print |
- Other entities
Characters | Regular expression | Name |
---|---|---|
integers | [0-9]+ | Integer |
char literal | 'x' | Integer |
identifiers | [_a-zA-Z][_a-zA-Z0-9]* | Ident |
string literal | ".*" | String |
Notes: For char literals, '\n' is supported as a new line character. To represent \, use: '\\'. \n may also be used in Strings, to print a newline. No other special sequences are supported.
Comments /* ... */ (multi-line)
- Complete list of token names
EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add, Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integer, String, Ident
- Program output
Output of the program should be:
- the word line, followed by:
- the line number where the token starts, followed by:
- the abbreviation col, followed by:
- the column number where the token starts, followed by:
- the token name.
- If the token name is one of Integer, Ident or String, the actual value of the same should follow.
- Test Cases
<lang c> /*
Hello world */
print("Hello, World!\n"); </lang>
- Output
line 4 col 1 Print line 4 col 6 Lparen line 4 col 7 String "Hello, World!\n" line 4 col 24 Rparen line 4 col 25 Semi line 5 col 1 EOI
<lang c> /*
Show Ident and Integers */
phoenix_number = 142857; print(phoenix_number, "\n"); </lang>
- Output
line 4 col 1 Ident phoenix_number line 4 col 16 Assign line 4 col 18 Integer 142857 line 4 col 24 Semi line 5 col 1 Print line 5 col 6 Lparen line 5 col 7 Ident phoenix_number line 5 col 21 Comma line 5 col 23 String "\n" line 5 col 27 Rparen line 5 col 28 Semi line 6 col 1 EOI
<lang c> /*
All lexical tokens - not syntactically correct, but that will have to wait until syntax analysis */
/* Print */ print /* Sub */ - /* Putc */ putc /* Lss */ < /* If */ if /* Gtr */ > /* While */ while /* Leq */ <= /* Lbrace */ { /* Neq */ != /* Rbrace */ } /* And */ && /* Lparen */ ( /* Semi */ ; /* Rparen */ ) /* Comma */ , /* Uminus */ - /* Assign */ = /* Mul */ * /* Integer */ 42 /* Div */ / /* String */ "String literal" /* Add */ + /* Ident */ variable_name /* character literal */ '\n' /* character literal */ ' ' </lang>
- Output
line 5 col 15 Print line 5 col 41 Sub line 6 col 15 Putc line 6 col 41 Lss line 7 col 15 If line 7 col 41 Gtr line 8 col 15 While line 8 col 41 Leq line 9 col 15 Lbrace line 9 col 41 Neq line 10 col 15 Rbrace line 10 col 41 And line 11 col 15 Lparen line 11 col 41 Semi line 12 col 15 Rparen line 12 col 41 Comma line 13 col 15 Sub line 13 col 41 Assign line 14 col 15 Mul line 14 col 41 Integer 42 line 15 col 15 Div line 15 col 41 String "String literal" line 16 col 15 Add line 16 col 41 Ident variable_name line 17 col 26 Integer 10 line 18 col 26 Integer 32 line 19 col 1 EOI
- Diagnostics
The following error conditions should be caught:
- Empty character constant. Example: ''
- Unknown escape sequence. Example: '\r'
- Multi-character constant. Example: 'xx'
- End-of-file in comment. Closing comment characters not found.
- End-of-file while scanning string literal. Closing string character not found.
- End-of-line while scanning string literal. Closing string character not found before end-of-line.
- Unrecognized character. Example: |
- Reference
The C and Python versions can be considered reference implementations.
- Implementations
C
<lang C>
#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
#include <stdbool.h>
#include <limits.h>
/* Number of elements in a true array (not valid for pointers). */
#define NELEMS(arr) (sizeof(arr) / sizeof(arr[0]))

/*
 * Minimal growable-array helpers.  da_dim(name, type) declares the array
 * pointer together with two bookkeeping ints: the next free index
 * (_qy_<name>_p) and the current capacity (_qy_<name>_max).
 */
#define da_dim(name, type)  type *name = NULL;           \
                            int _qy_ ## name ## _p = 0;  \
                            int _qy_ ## name ## _max = 0

/* Reset the write position to 0; the allocated storage is kept. */
#define da_rewind(name)     _qy_ ## name ## _p = 0

/*
 * Grow the array by 32 elements when it is full.  The realloc result is
 * checked before it replaces the pointer so a failed allocation cannot
 * lose the existing buffer or lead to a NULL write; on failure the program
 * reports the problem and exits.
 */
#define da_redim(name)      do {                                              \
        if (_qy_ ## name ## _p >= _qy_ ## name ## _max) {                     \
            void *_qy_grown = realloc(name,                                   \
                (size_t)(_qy_ ## name ## _max + 32) * sizeof(name[0]));       \
            if (_qy_grown == NULL) {                                          \
                fprintf(stderr, "out of memory\n");                           \
                exit(1);                                                      \
            }                                                                 \
            name = _qy_grown;                                                 \
            _qy_ ## name ## _max += 32;                                       \
        }                                                                     \
    } while (0)

/* Append x, growing the array first if necessary. */
#define da_append(name, x)  do {da_redim(name); name[_qy_ ## name ## _p++] = x;} while (0)

/* Number of elements appended since the last da_rewind(). */
#define da_len(name)        _qy_ ## name ## _p
/*
 * Token kinds produced by the lexer.
 * dependency: the ordering of the atr table in parse.c is based on these.
 * run() also uses the numeric value to select the printed token name, so
 * the order here must stay in step with that list.
 */
typedef enum {
    EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div,
    Add, Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident
} TokenType;
/*
 * One token as returned by gettok(): the token kind (a TokenType value),
 * the line and column captured before the token was scanned, and a payload.
 * The payload is n for integer and character constants, or text for
 * identifiers and string literals; other kinds are built with {0}.
 */
typedef struct {
int tok; int err_ln, err_col; union { int n; /* value for constants */ char *text; /* text for idents */ };
} tok_s;
/*
 * Lexer state shared by the scanning routines: the input and output streams,
 * the current line and column (updated by next_ch), the current look-ahead
 * character the_ch (starts as a blank so the first gettok() call reads input),
 * and the growable buffer `text` reused for identifier/number/string text.
 */
static FILE *source_fp, *dest_fp; static int line = 1, col = 0, the_ch = ' '; da_dim(text, char);
/* Forward declaration: div_or_cmt() calls gettok() before gettok() is defined. */
tok_s gettok();
/*
 * Report a fatal lexer error and terminate the program with exit status 1.
 *
 * err_line, err_col: source position printed in front of the message.
 * fmt, ...:          printf-style format and arguments for the message.
 *
 * The message is formatted with vsnprintf so that an over-long argument
 * (for example a very long malformed number passed through %s) is truncated
 * instead of overflowing the local buffer.
 */
static void error(int err_line, int err_col, const char *fmt, ... ) {
    char buf[1000];
    va_list ap;

    va_start(ap, fmt);
    vsnprintf(buf, sizeof buf, fmt, ap);
    va_end(ap);
    printf("(%d,%d) error: %s\n", err_line, err_col, buf);
    exit(1);
}
/*
 * Read the next character from source_fp into the_ch and keep the
 * line/column counters in step: a newline starts a new line at column 0,
 * any other character (including EOF) advances the column by one.
 * Returns the character just read.
 */
static int next_ch() {
    the_ch = getc(source_fp);
    if (the_ch == '\n') {
        ++line;
        col = 0;
    } else {
        ++col;
    }
    return the_ch;
}
static tok_s char_lit(int n, int err_line, int err_col) { /* 'x' */
if (the_ch == '\) error(err_line, err_col, "gettok: empty character constant"); if (the_ch == '\\') { next_ch(); if (the_ch == 'n') n = 10; else if (the_ch == '\\') n = '\\'; else error(err_line, err_col, "gettok: unknown escape sequence \\%c", the_ch); } if (next_ch() != '\) error(err_line, err_col, "multi-character constant"); next_ch(); return (tok_s){Integerk, err_line, err_col, {n}};
}
static tok_s div_or_cmt(int err_line, int err_col) { /* process divide or comments */
if (the_ch != '*') return (tok_s){Div, err_line, err_col, {0}};
/* comment found */ for (;;) { if (next_ch() == '*' && next_ch() == '/') { next_ch(); return gettok(); } else if (the_ch == EOF) error(err_line, err_col, "EOF in comment"); }
}
static tok_s string_lit(int start, int err_line, int err_col) { /* "st" */
da_rewind(text);
while (next_ch() != start) { if (the_ch == '\n') error(err_line, err_col, "EOL in string"); if (the_ch == EOF) error(err_line, err_col, "EOF in string"); da_append(text, (char)the_ch); } da_append(text, '\0');
next_ch(); return (tok_s){Stringk, err_line, err_col, {.text=text}};
}
/*
 * bsearch() comparison callback: both arguments point at a `char *`
 * (the search key, or the leading string member of a keyword entry);
 * the pointed-to strings are ordered with strcmp().
 */
static int kwd_cmp(const void *p1, const void *p2) {
    const char *const *lhs = p1;
    const char *const *rhs = p2;

    return strcmp(*lhs, *rhs);
}
/*
 * Classify an identifier: returns the keyword's token type when ident is
 * one of the reserved words, otherwise Ident.  The table is searched with
 * bsearch(), so its entries must stay sorted by spelling.
 */
static TokenType get_ident_type(const char *ident) {
    static struct kwd_entry {
        char *s;
        TokenType sym;
    } kwds[] = {
        {"if",    If},
        {"print", Print},
        {"putc",  Putc},
        {"while", While},
    };
    struct kwd_entry *hit;

    hit = bsearch(&ident, kwds, NELEMS(kwds), sizeof(kwds[0]), kwd_cmp);
    if (hit == NULL) {
        return Ident;
    }
    return hit->sym;
}
static tok_s ident_or_int(int err_line, int err_col) {
int n, is_number = true;
da_rewind(text); while (isalnum(the_ch) || the_ch == '_') { da_append(text, (char)the_ch); if (!isdigit(the_ch)) is_number = false; next_ch(); } if (da_len(text) == 0) error(err_line, err_col, "gettok: unrecognized character (%d) '%c'\n", the_ch, the_ch); da_append(text, '\0'); if (isdigit(text[0])) { if (!is_number) error(err_line, err_col, "invalid number: %s\n", text); n = strtol(text, NULL, 0); if (n == LONG_MAX && errno == ERANGE) error(err_line, err_col, "Number exceeds maximum value"); return (tok_s){Integerk, err_line, err_col, {n}}; } return (tok_s){get_ident_type(text), err_line, err_col, {.text=text}};
}
/*
 * Decide between a one- and two-character operator (for example '<' vs
 * '<=').  The first character has already been consumed.  If the_ch equals
 * expect it is consumed and ifyes is produced; otherwise ifno is produced.
 * Passing EOI as ifno marks the single-character form as invalid, which is
 * reported as an unrecognized character.
 */
static tok_s follow(int expect, TokenType ifyes, TokenType ifno, int err_line, int err_col) {
    TokenType kind = ifno;

    if (the_ch == expect) {
        next_ch();
        kind = ifyes;
    } else if (ifno == EOI) {
        error(err_line, err_col, "follow: unrecognized character '%c' (%d)\n", the_ch, the_ch);
    }
    return (tok_s){kind, err_line, err_col, {0}};
}
tok_s gettok() { /* return the token type */
/* skip white space */ while (isspace(the_ch)) next_ch(); int err_line = line; int err_col = col; switch (the_ch) { case '{': next_ch(); return (tok_s){Lbrace, err_line, err_col, {0}}; case '}': next_ch(); return (tok_s){Rbrace, err_line, err_col, {0}}; case '(': next_ch(); return (tok_s){Lparen, err_line, err_col, {0}}; case ')': next_ch(); return (tok_s){Rparen, err_line, err_col, {0}}; case '+': next_ch(); return (tok_s){Add, err_line, err_col, {0}}; case '-': next_ch(); return (tok_s){Sub, err_line, err_col, {0}}; case '*': next_ch(); return (tok_s){Mul, err_line, err_col, {0}}; case ';': next_ch(); return (tok_s){Semi, err_line, err_col, {0}}; case ',': next_ch(); return (tok_s){Comma, err_line, err_col, {0}}; case '>': next_ch(); return (tok_s){Gtr, err_line, err_col, {0}}; case '=': next_ch(); return (tok_s){Assign, err_line, err_col, {0}}; case '/': next_ch(); return div_or_cmt(err_line, err_col); case '\: next_ch(); return char_lit(the_ch, err_line, err_col); case '<': next_ch(); return follow('=', Leq, Lss, err_line, err_col); case '!': next_ch(); return follow('=', Neq, EOI, err_line, err_col); case '&': next_ch(); return follow('&', And, EOI, err_line, err_col); case '"' : return string_lit(the_ch, err_line, err_col); default: return ident_or_int(err_line, err_col); case EOF: return (tok_s){EOI, err_line, err_col, {0}}; }
}
/*
 * Tokenize the whole input: fetch tokens with gettok() until EOI and write
 * one line per token to dest_fp (position, token name, and the value for
 * integers, identifiers and strings).  Closes dest_fp afterwards unless it
 * is stdout.
 *
 * Token names come from a table indexed by token kind, so the output does
 * not depend on the exact column padding of a packed string literal.  The
 * table order must match the TokenType enumeration.
 */
void run() {
    static const char *const tok_names[] = {
        "EOI", "Print", "Putc", "If", "While", "Lbrace", "Rbrace", "Lparen",
        "Rparen", "Uminus", "Mul", "Div", "Add", "Sub", "Lss", "Gtr", "Leq",
        "Neq", "And", "Semi", "Comma", "Assign", "Integer", "String", "Ident"
    };
    tok_s tok;

    do {
        tok = gettok();
        fprintf(dest_fp, "line %5d col %5d %-8s",
            tok.err_ln, tok.err_col, tok_names[tok.tok]);

        if (tok.tok == Integerk) {
            fprintf(dest_fp, " %4d", tok.n);
        } else if (tok.tok == Ident) {
            fprintf(dest_fp, " %s", tok.text);
        } else if (tok.tok == Stringk) {
            fprintf(dest_fp, " \"%s\"", tok.text);
        }
        fprintf(dest_fp, "\n");
    } while (tok.tok != EOI);

    if (dest_fp != stdout) {
        fclose(dest_fp);
    }
}
/*
 * Select a stream: an empty file name means "use the standard stream std",
 * otherwise the named file is opened with the given mode.  Failure to open
 * the file is a fatal error.
 */
void init_io(FILE **fp, FILE *std, const char mode[], const char fn[]) {
    if (fn[0] == '\0') {
        *fp = std;
        return;
    }

    *fp = fopen(fn, mode);
    if (*fp == NULL) {
        error(0, 0, "Can't open %s\n", fn);
    }
}
/*
 * Program entry point.  argv[1], when given, names the source file
 * (otherwise stdin is read); argv[2], when given, names the output file
 * (otherwise stdout is written).  The input is then tokenized by run().
 */
int main(int argc, char *argv[]) {
init_io(&source_fp, stdin, "r", argc > 1 ? argv[1] : ""); init_io(&dest_fp, stdout, "wb", argc > 2 ? argv[2] : ""); run();
} </lang>
Euphoria
<lang euphoria> include std/io.e include std/map.e include std/types.e include std/convert.e
constant true = 1, false = 0, EOF = -1
enum EOI, Printk, Putc, Ifk, Whilek, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div,
Add, Sub, Lss, Gtr, Leq, Neq, Andk, Semi, Comma, Assign, Integerk, Stringk, Ident
constant all_syms = { "EOI", "Print", "Putc", "If", "While", "Lbrace", "Rbrace", "Lparen",
"Rparen", "Uminus", "Mul", "Div", "Add", "Sub", "Lss", "Gtr", "Leq", "Neq", "And", "Semi", "Comma", "Assign", "Integer", "String", "Ident"}
integer input_file, the_ch = ' ', the_col = 0, the_line = 1 sequence symbols map key_words = new()
procedure error(sequence format, sequence data)
printf(STDOUT, format, data) abort(1)
end procedure
-- get the next character from the input
-- Stores it in the_ch, advances the column, and on a newline starts a new
-- line at column 0.  Returns the character read.
function next_ch()
    the_ch = getc(input_file)
    the_col += 1
    if the_ch = '\n' then
        the_line += 1
        the_col = 0
    end if
    return the_ch
end function
-- 'x' - character constants
-- Called with the_ch on the opening quote.  Returns an Integerk token whose
-- value is the character code; '\n' and '\\' are the only escapes accepted.
function char_lit(integer err_line, integer err_col)
    integer n = next_ch()              -- skip opening quote
    if the_ch = '\'' then
        error("%d %d empty character constant", {err_line, err_col})
    elsif the_ch = '\\' then
        next_ch()
        if the_ch = 'n' then
            n = 10
        elsif the_ch = '\\' then
            n = '\\'
        else
            error("%d %d unknown escape sequence \\%c", {err_line, err_col, the_ch})
        end if
    end if
    if next_ch() != '\'' then
        error("%d %d multi-character constant", {err_line, err_col})
    end if
    next_ch()                          -- step past the closing quote
    return {Integerk, err_line, err_col, n}
end function
-- process divide or comments
-- Called with the_ch on '/'.  Returns a Div token unless the next character
-- is '*', in which case the comment is skipped and the following token is
-- returned.  The character after a '*' is re-examined rather than dropped,
-- so a comment closed by a run of stars followed by '/' ends correctly.
function div_or_cmt(integer err_line, integer err_col)
    if next_ch() != '*' then
        return {Div, err_line, err_col}
    end if

    -- comment found: move past the opening '*' so it cannot double as the closer
    next_ch()
    while true do
        if the_ch = '*' then
            if next_ch() = '/' then
                next_ch()
                return get_tok()
            end if
        elsif the_ch = EOF then
            error("%d %d EOF in comment", {err_line, err_col})
        else
            next_ch()
        end if
    end while
end function
-- "string"
-- Called with the_ch on the opening delimiter (also passed as start).
-- Collects characters up to the matching delimiter and returns a Stringk
-- token; EOF or a newline before the closing delimiter is an error.
function string_lit(integer start, integer err_line, integer err_col)
    string text = ""

    while next_ch() != start do
        if the_ch = EOF then
            error("%d %d EOF while scanning string literal", {err_line, err_col})
        end if
        if the_ch = '\n' then
            error("%d %d EOL while scanning string literal", {err_line, err_col})
        end if
        text &= the_ch
    end while

    next_ch()                          -- step past the closing delimiter
    return {Stringk, err_line, err_col, text}
end function
-- handle identifiers and integers
-- Collects the run of letters, digits and underscores starting at the_ch.
-- A run starting with a digit must be all digits and becomes an Integerk
-- token; otherwise the text is looked up in key_words and returned as a
-- keyword token or as Ident.
function ident_or_int(integer err_line, integer err_col)
    integer n, is_number = true
    string text = ""

    while t_alnum(the_ch) or the_ch = '_' do
        text &= the_ch
        if not t_digit(the_ch) then
            is_number = false
        end if
        next_ch()
    end while

    if length(text) = 0 then
        error("%d %d ident_or_int: unrecognized character: (%d) '%s'", {err_line, err_col, the_ch, the_ch})
    end if

    if t_digit(text[1]) then
        if not is_number then
            error("%d %d invalid number: %s", {err_line, err_col, text})
        end if
        n = to_integer(text)
        return {Integerk, err_line, err_col, n}
    end if

    if has(key_words, text) then
        return {get(key_words, text), err_line, err_col}
    end if

    return {Ident, err_line, err_col, text}
end function
-- look ahead for '>=', etc.
-- Called with the_ch on the first character of the operator.  If the next
-- character equals expect it is consumed and ifyes is returned, otherwise
-- ifno.  Passing EOI as ifno marks the single-character form as invalid.
function follow(integer expect, integer ifyes, integer ifno, integer err_line, integer err_col)
    if next_ch() = expect then
        next_ch()
        return {ifyes, err_line, err_col}
    end if

    if ifno = EOI then
        error("%d %d follow: unrecognized character: (%d)", {err_line, err_col, the_ch})
    end if

    return {ifno, err_line, err_col}
end function
-- return the next token type
-- Skips white space, records where the token starts, then dispatches on the
-- current character.  Single-character tokens are looked up in the symbols
-- table; anything not found there is scanned as an identifier or integer.
function get_tok()
    while t_space(the_ch) do
        next_ch()
    end while

    integer err_line = the_line
    integer err_col  = the_col

    switch the_ch do
        case EOF  then return {EOI, err_line, err_col}
        case '/'  then return div_or_cmt(err_line, err_col)
        case '\'' then return char_lit(err_line, err_col)
        case '<'  then return follow('=', Leq, Lss, err_line, err_col)
        case '!'  then return follow('=', Neq, EOI, err_line, err_col)
        case '&'  then return follow('&', Andk, EOI, err_line, err_col)
        case '"'  then return string_lit(the_ch, err_line, err_col)
        case else
            integer sym = symbols[the_ch]
            if sym != EOI then
                next_ch()
                return {sym, err_line, err_col}
            end if
            return ident_or_int(err_line, err_col)
    end switch
end function
procedure init()
put(key_words, "if", Ifk) put(key_words, "print", Printk) put(key_words, "putc", Putc) put(key_words, "while", Whilek)
symbols = repeat(EOI, 256) symbols['{'] = Lbrace symbols['}'] = Rbrace symbols['('] = Lparen symbols[')'] = Rparen symbols['+'] = Add symbols['-'] = Sub symbols['*'] = Mul symbols[';'] = Semi symbols[','] = Comma symbols['>'] = Gtr symbols['='] = Assign
end procedure
procedure main(sequence cl)
sequence file_name
input_file = STDIN if length(cl) > 2 then file_name = cl[3] input_file = open(file_name, "r") if input_file = -1 then error("Could not open %s", {file_name}) end if end if init() sequence t loop do t = get_tok() printf(STDOUT, "line %5d col %5d %-8s", {t[2], t[3], all_syms[t[1]]}) switch t[1] do case Integerk then printf(STDOUT, " %5d\n", {t[4]}) case Ident then printf(STDOUT, " %s\n", {t[4]}) case Stringk then printf(STDOUT, " \"%s\"\n", {t[4]}) case else printf(STDOUT, "\n") end switch until t[1] = EOI end loop
end procedure
main(command_line()) </lang>
FreeBASIC
<lang FreeBASIC> enum Token_type
tk_eoi tk_print tk_putc tk_if tk_while tk_lbrace tk_rbrace tk_lparen tk_rparen tk_uminus tk_mul tk_div tk_add tk_sub tk_lss tk_gtr tk_leq tk_neq tk_and tk_semi tk_comma tk_assign tk_integer tk_string tk_ident
end enum
' character constants used by the lexer
const NewLine = chr(10)
const DoubleQuote = chr(34)
' where we store keywords and variables
type Symbol
    s_name as string        ' spelling of the keyword / identifier
    tok as Token_type       ' token kind returned for that spelling
end type
' symbol table, grown one entry at a time by install()
dim shared symtab() as Symbol

' lexer state shared by next_line(), next_char() and gettok()
dim shared cur_line as string       ' text of the line being scanned
dim shared cur_ch as string         ' current character; "" means end-of-file
dim shared line_num as integer      ' line number of cur_line
dim shared col_num as integer       ' column of cur_ch within cur_line
' Non-zero when ch is non-empty and compares between "0" and "9" inclusive.
function is_digit(byval ch as string) as long
    if ch = "" then return 0
    return ch >= "0" and ch <= "9"
end function
' Non-zero when ch is non-empty and is a letter (case-insensitive A..Z) or a digit.
function is_alnum(byval ch as string) as long
    if ch = "" then return 0
    dim upper as string = UCase(ch)
    return (upper >= "A" and upper <= "Z") or is_digit(ch)
end function
' Print "(line:col) msg" and terminate the program.
sub error_msg(byval eline as integer, byval ecol as integer, byval msg as string)
    print "("; eline; ":"; ecol; ")"; " "; msg
    system
end sub
' add an identifier to the symbol table
' Returns the index of the new entry.
function install(byval s_name as string, byval tok as Token_type) as integer
    dim n as integer

    ' grow the table by one slot and take the new upper bound as the index
    n = ubound(symtab)
    redim preserve symtab(n + 1)
    n = ubound(symtab)

    symtab(n).s_name = s_name
    symtab(n).tok = tok
    return n
end function
' search for an identifier in the symbol table
' Returns the index of the matching entry, or -1 when s_name is not present.
function lookup(byval s_name as string) as integer
    dim i as integer

    for i = lbound(symtab) to ubound(symtab)
        if symtab(i).s_name = s_name then return i
    next
    return -1
end function
sub next_line() ' read the next line of input from the source file
    cur_line = ""
    cur_ch = ""     ' empty cur_ch means end-of-file
    if eof(1) then exit sub
    line input #1, cur_line
    ' line input drops the terminator; put it back so the lexer sees EOL
    cur_line = cur_line + NewLine
    line_num += 1
    col_num = 1
end sub
sub next_char() ' get the next char
    cur_ch = ""
    col_num += 1
    ' past the end of the current line: fetch the next one (resets col_num)
    if col_num > len(cur_line) then next_line()
    ' still out of range here means end-of-file, so cur_ch stays ""
    if col_num <= len(cur_line) then cur_ch = mid(cur_line, col_num, 1)
end sub
' Decide between a two-character and a one-character operator.
' If the current character equals expect it is consumed and ifyes is returned;
' otherwise ifno is returned. Passing tk_eoi as ifno means the one-character
' form is not a valid token, and an error is reported instead.
function follow(byval err_line as integer, byval err_col as integer, byval expect as string, byval ifyes as Token_type, byval ifno as Token_type) as Token_type
    if cur_ch = expect then
        next_char()
        return ifyes
    end if
    if ifno = tk_eoi then error_msg(err_line, err_col, "follow unrecognized character: " + cur_ch)
    return ifno
end function
' Scan the next token.
'   err_line, err_col - receive the position where the token starts
'   tok               - receives the token kind
'   v                 - receives the token text for integers, strings
'                       (including the surrounding quotes) and identifiers
sub gettok(byref err_line as integer, byref err_col as integer, byref tok as Token_type, byref v as string)
    ' skip whitespace
    do while (cur_ch = " " or cur_ch = chr(9) or cur_ch = NewLine) and (cur_ch <> "")
        next_char()
    loop

    err_line = line_num
    err_col = col_num

    select case cur_ch
        case "": tok = tk_eoi: exit sub
        case "{": tok = tk_lbrace: next_char(): exit sub
        case "}": tok = tk_rbrace: next_char(): exit sub
        case "(": tok = tk_lparen: next_char(): exit sub
        case ")": tok = tk_rparen: next_char(): exit sub
        case "+": tok = tk_add: next_char(): exit sub
        case "-": tok = tk_sub: next_char(): exit sub
        case "*": tok = tk_mul: next_char(): exit sub
        case ";": tok = tk_semi: next_char(): exit sub
        case ",": tok = tk_comma: next_char(): exit sub
        case ">": tok = tk_gtr: next_char(): exit sub
        case "=": tok = tk_assign: next_char(): exit sub

        case "/": ' div or comment
            next_char()
            if cur_ch <> "*" then
                tok = tk_div
                exit sub
            end if
            ' skip comments
            next_char()     ' step past the opening "*" so it cannot close the comment
            do
                if cur_ch = "" then error_msg(err_line, err_col, "EOF in comment")
                if cur_ch = "*" then
                    ' do not consume the character after "*" unless it is "/":
                    ' in "**/" the second "*" must be re-examined
                    next_char()
                    if cur_ch = "/" then
                        next_char()
                        gettok(err_line, err_col, tok, v)
                        exit sub
                    end if
                else
                    next_char()
                end if
            loop

        case "'": ' single char literals
            next_char()
            v = str(Asc(cur_ch))
            if cur_ch = "'" then error_msg(err_line, err_col, "empty character constant")
            if cur_ch = "\" then
                next_char()
                if cur_ch = "n" then
                    v = "10"
                elseif cur_ch = "\" then
                    v = Str(Asc("\"))
                else
                    error_msg(err_line, err_col, "unknown escape sequence: " + cur_ch)
                end if
            end if
            next_char()
            if cur_ch <> "'" then error_msg(err_line, err_col, "multi-character constant")
            next_char()
            tok = tk_integer
            exit sub

        case "<": next_char(): tok = follow(err_line, err_col, "=", tk_Leq, tk_Lss): exit sub
        case "!": next_char(): tok = follow(err_line, err_col, "=", tk_Neq, tk_EOI): exit sub
        case "&": next_char(): tok = follow(err_line, err_col, "&", tk_And, tk_EOI): exit sub

        case DoubleQuote: ' string
            v = cur_ch
            next_char()
            do while cur_ch <> DoubleQuote
                if cur_ch = NewLine then error_msg(err_line, err_col, "EOL in string")
                if cur_ch = "" then error_msg(err_line, err_col, "EOF in string")
                v += cur_ch
                next_char()
            loop
            v += cur_ch     ' keep the closing quote as part of the token text
            next_char()
            tok = tk_string
            exit sub

        case else ' integers or identifiers
            dim is_number as boolean = is_digit(cur_ch)
            v = ""
            do while is_alnum(cur_ch) orelse cur_ch = "_"
                if not is_digit(cur_ch) then is_number = false
                v += cur_ch
                next_char()
            loop
            if len(v) = 0 then error_msg(err_line, err_col, "unknown character: " + cur_ch)
            if is_digit(mid(v, 1, 1)) then
                ' starts with a digit, so every character must be a digit
                if not is_number then error_msg(err_line, err_col, "invalid number: " + v)
                tok = tk_integer
                exit sub
            end if
            ' keywords were installed in the symbol table; anything else is an identifier
            dim as integer index = lookup(v)
            if index = -1 then
                tok = tk_ident
            else
                tok = symtab(index).tok
            end if
            exit sub
    end select
end sub
' Prepare the lexer: register the keywords, open the source file as file #1
' and load the first character. Terminates the program if the file cannot be
' opened.
sub init_lex(byval filein as string)
    install("if", tk_if)
    install("print", tk_print)
    install("putc", tk_putc)
    install("while", tk_while)

    ' the function form of Open returns non-zero on failure
    if open(filein for input as #1) <> 0 then
        print "Could not open "; filein
        system
    end if

    cur_line = ""
    line_num = 0
    col_num = 0
    next_char()
end sub
' Print one line per token - position, token name and, for integers,
' identifiers and strings, the token text - until end of input is reached.
sub scanner()
    dim err_line as integer
    dim err_col as integer
    dim tok as Token_type
    dim v as string
    dim tok_list(tk_eoi to tk_ident) as string

    ' printable name for each token kind
    tok_list(tk_eoi    ) = "EOI"
    tok_list(tk_print  ) = "Print"
    tok_list(tk_putc   ) = "Putc"
    tok_list(tk_if     ) = "If"
    tok_list(tk_while  ) = "While"
    tok_list(tk_lbrace ) = "Lbrace"
    tok_list(tk_rbrace ) = "Rbrace"
    tok_list(tk_lparen ) = "Lparen"
    tok_list(tk_rparen ) = "Rparen"
    tok_list(tk_uminus ) = "Uminus"
    tok_list(tk_mul    ) = "Mul"
    tok_list(tk_div    ) = "Div"
    tok_list(tk_add    ) = "Add"
    tok_list(tk_sub    ) = "Sub"
    tok_list(tk_lss    ) = "Lss"
    tok_list(tk_gtr    ) = "Gtr"
    tok_list(tk_leq    ) = "Leq"
    tok_list(tk_neq    ) = "Neq"
    tok_list(tk_and    ) = "And"
    tok_list(tk_semi   ) = "Semi"
    tok_list(tk_comma  ) = "Comma"
    tok_list(tk_assign ) = "Assign"
    tok_list(tk_integer) = "Integer"
    tok_list(tk_string ) = "String"
    tok_list(tk_ident  ) = "Ident"

    do
        gettok(err_line, err_col, tok, v)
        ' the \...\ string field is 8 characters wide (backslashes included)
        ' so the longest token name, "Integer", is not truncated
        print using "line ##### col ##### \      \"; err_line; err_col; tok_list(tok);
        if tok = tk_integer orelse tok = tk_ident orelse tok = tk_string then print " " + v;
        print
    loop until tok = tk_eoi
end sub
' Entry point: requires the source file name as the first command-line argument.
sub main()
    if command(1) = "" then print "filename required" : system
    init_lex(command(1))
    scanner()
end sub
main()
system
</lang>
Python
<lang Python>
from __future__ import print_function
import sys
# following two must remain in the same order

# token type codes; each one is also the index of its name in all_syms
(EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add,
 Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident) = range(25)

all_syms = [
    'EOI', 'Print', 'Putc', 'If', 'While', 'Lbrace', 'Rbrace', 'Lparen',
    'Rparen', 'Uminus', 'Mul', 'Div', 'Add', 'Sub', 'Lss', 'Gtr', 'Leq', 'Neq', 'And',
    'Semi', 'Comma', 'Assign', 'Integer', 'String', 'Ident'
]
# single character only symbols
symbols = {
    '{': Lbrace, '}': Rbrace, '(': Lparen, ')': Rparen, '+': Add, '-': Sub,
    '*': Mul, ';': Semi, ',': Comma, '>': Gtr, '=': Assign
}
# reserved words and the token type each one produces
key_words = dict([
    ('if', If),
    ('print', Print),
    ('putc', Putc),
    ('while', While),
])
# lexer state shared by the scanning functions
the_ch = " "        # dummy first char - but it must be a space
the_col = 0
the_line = 1
input_file = None
# show error and exit
def error(line, col, msg):
    """Print the position and message, then terminate with exit status 1."""
    print(line, col, msg)
    # sys.exit rather than the bare exit(), which is only a convenience
    # added by the site module and may be absent
    sys.exit(1)
# get the next character from the input
def next_ch():
    """Read one character into the_ch, update the_line/the_col, and return it.

    At end of input the read yields an empty string.
    """
    global the_ch, the_col, the_line

    the_ch = input_file.read(1)
    the_col += 1
    if the_ch == '\n':
        the_line += 1
        the_col = 0
    return the_ch
# 'x' - character constants
def char_lit(err_line, err_col):
    """Scan a character constant and return it as an Integerk token.

    The token value is the character's code; the escapes \\n and \\\\ are
    recognised. Returns (Integerk, err_line, err_col, code).
    """
    next_ch()                       # skip opening quote
    if len(the_ch) == 0:
        # ord('') would raise TypeError; report the real problem instead
        error(err_line, err_col, "EOF in character constant")
    n = ord(the_ch)
    if the_ch == '\'':
        error(err_line, err_col, "empty character constant")
    elif the_ch == '\\':
        next_ch()
        if the_ch == 'n':
            n = 10
        elif the_ch == '\\':
            n = ord('\\')           # value must be an int, like every other case
        else:
            error(err_line, err_col, "unknown escape sequence \\%c" % (the_ch))
    if next_ch() != '\'':
        error(err_line, err_col, "multi-character constant")
    next_ch()
    return Integerk, err_line, err_col, n
# process divide or comments
def div_or_cmt(err_line, err_col):
    """Handle '/': either a Div token or a /* ... */ comment.

    A comment is skipped and the token that follows it is returned.
    """
    if next_ch() != '*':
        return Div, err_line, err_col

    # comment found
    next_ch()                       # step past the opening '*'
    while True:
        if the_ch == '*':
            # only consume the following character when it closes the comment;
            # in "**/" the second '*' has to be examined again
            if next_ch() == '/':
                next_ch()
                return gettok()
        elif len(the_ch) == 0:
            error(err_line, err_col, "EOF in comment")
        else:
            next_ch()
# "string"
def string_lit(start, err_line, err_col):
    """Scan a string literal delimited by `start`.

    Returns (Stringk, err_line, err_col, text) with the delimiters stripped.
    The literal may not span lines or run into end of input.
    """
    text = ""

    while next_ch() != start:
        if len(the_ch) == 0:
            error(err_line, err_col, "EOF while scanning string literal")
        if the_ch == '\n':
            error(err_line, err_col, "EOL while scanning string literal")
        text += the_ch

    next_ch()
    return Stringk, err_line, err_col, text
# handle identifiers and integers
def ident_or_int(err_line, err_col):
    """Scan a run of alphanumerics/underscores and classify it.

    Returns an Integerk token (with its int value), a keyword token, or an
    Ident token (with its text).
    """
    is_number = True
    text = ""

    while the_ch.isalnum() or the_ch == '_':
        text += the_ch
        if not the_ch.isdigit():
            is_number = False
        next_ch()

    if len(text) == 0:
        error(err_line, err_col, "ident_or_int: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))

    if text[0].isdigit():
        # starts with a digit, so every character must be a digit
        if not is_number:
            error(err_line, err_col, "invalid number: %s" % (text))
        n = int(text)
        return Integerk, err_line, err_col, n

    if text in key_words:
        return key_words[text], err_line, err_col

    return Ident, err_line, err_col, text
# look ahead for '>=', etc.
def follow(expect, ifyes, ifno, err_line, err_col):
    """Choose between a two-character and a one-character operator.

    If the next character is `expect`, it is consumed and `ifyes` is returned;
    otherwise `ifno` is returned. Passing EOI as `ifno` means the
    one-character form is not a valid token, so an error is reported.
    """
    if next_ch() == expect:
        next_ch()
        return ifyes, err_line, err_col

    if ifno == EOI:
        if len(the_ch) == 0:
            # ord('') would raise TypeError while building the message below
            error(err_line, err_col, "follow: unexpected end of input")
        else:
            error(err_line, err_col, "follow: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))

    return ifno, err_line, err_col
# return the next token type
def gettok():
    """Skip whitespace and return the next token as a tuple.

    The tuple is (type, line, col) with a fourth value element for integers,
    strings and identifiers.
    """
    while the_ch.isspace():
        next_ch()

    err_line = the_line
    err_col = the_col

    if len(the_ch) == 0:
        return EOI, err_line, err_col
    elif the_ch == '/':
        return div_or_cmt(err_line, err_col)
    elif the_ch == '\'':
        return char_lit(err_line, err_col)
    elif the_ch == '<':
        return follow('=', Leq, Lss, err_line, err_col)
    elif the_ch == '!':
        return follow('=', Neq, EOI, err_line, err_col)
    elif the_ch == '&':
        return follow('&', And, EOI, err_line, err_col)
    elif the_ch == '"':
        return string_lit(the_ch, err_line, err_col)
    elif the_ch in symbols:
        sym = symbols[the_ch]
        next_ch()
        return sym, err_line, err_col
    else:
        return ident_or_int(err_line, err_col)
# main driver
# Read from the file named on the command line, or from stdin when none is given.
input_file = sys.stdin
if len(sys.argv) > 1:
    try:
        input_file = open(sys.argv[1], "r", 4096)
    except IOError:
        error(0, 0, "Can't open %s" % sys.argv[1])

# Print one line per token, up to and including EOI.
while True:
    t = gettok()
    tok = t[0]
    line = t[1]
    col = t[2]

    print("line %5d col %5d %-8s" % (line, col, all_syms[tok]), end='')

    # only these three token kinds carry a value in t[3]
    if tok == Integerk:
        print(" %5d" % (t[3]))
    elif tok == Ident:
        print(" %s" % (t[3]))
    elif tok == Stringk:
        print(' "%s"' % (t[3]))
    else:
        print("")

    if tok == EOI:
        break
</lang>