Ed Davis

Joined 31 August 2022
m
Replaced content with "Hello, World!"
No edit summary
m (Replaced content with "Hello, World!")
 
(One intermediate revision by the same user not shown)
Line 1:
Hello, World!
Lexical analysis is the process of converting a sequence of characters (such as in a
computer program or web page) into a sequence of tokens (strings with an identified
"meaning"). A program that performs lexical analysis may be called a lexer, tokenizer,
or scanner (though "scanner" is also used to refer to the first stage of a lexer).
 
;The Task
 
Create a lexical analyzer for the Tiny programming language. The
program should read input from a file and/or stdin, and write
output to a file and/or stdout.
 
;Specification
 
The various token types are denoted below.
 
;Operators
 
{| class="wikitable"
|-
! Characters !! Common name !! Name
|-
| * || multiply || Mul
|-
| / || divide || Div
|-
| + || plus || Add
|-
| - || minus and unary minus || Sub and Uminus
|-
| < || less than || Lss
|-
| <= || less than or equal || Leq
|-
| > || greater than || Gtr
|-
| != || not equal || Neq
|-
| = || assign || Assign
|-
| && || and || And
|}
 
;Symbols
 
{| class="wikitable"
|-
! Characters !! Common name !! Name
|-
| ( || left parenthesis || Lparen
|-
| ) || right parenthesis || Rparen
|-
| { || left brace || Lbrace
|-
| } || right brace || Rbrace
|-
| ; || semi colon || Semi
|-
| , || comma || Comma
|}
 
;Keywords
 
{| class="wikitable"
|-
! Characters !! Name
|-
| if || If
|-
| while || While
|-
| print || Print
|-
| putc || Putc
|}
 
;Other entities
 
{| class="wikitable"
|-
! Characters !! Regular expression !! Name
|-
| integers || [0-9]+ || Integer
|-
| char literal || 'x' || Integer
|-
| identifiers || [_a-zA-Z][_a-zA-Z0-9]* || Ident
|-
| string literal || "[^"\n]*" || String
|}
 
Notes: For char literals, '\n' is supported as a new line
character. To represent \, use: '\\'. \n may also be used in
Strings, to print a newline. No other special sequences are
supported.
 
'''Comments''' /* ... */ (multi-line)
 
;Complete list of token names
 
'''EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add, Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integer, String, Ident'''
 
;Program output
 
Output of the program should be:
 
* the word line, followed by:
* the line number where the token starts, followed by:
* the abbreviation col, followed by:
* the column number where the token starts, followed by:
* the token name.
* If the token name is one of Integer, Ident or String, the actual value of the same should follow.
 
;Test Cases
 
<lang c>
/*
Hello world
*/
print("Hello, World!\n");
</lang>
 
;Output
 
<b>
<pre>
line 4 col 1 Print
line 4 col 6 Lparen
line 4 col 7 String "Hello, World!\n"
line 4 col 24 Rparen
line 4 col 25 Semi
line 5 col 1 EOI
</pre>
</b>
 
<lang c>
/*
Show Ident and Integers
*/
phoenix_number = 142857;
print(phoenix_number, "\n");
</lang>
 
;Output
 
<b>
<pre>
line 4 col 1 Ident phoenix_number
line 4 col 16 Assign
line 4 col 18 Integer 142857
line 4 col 24 Semi
line 5 col 1 Print
line 5 col 6 Lparen
line 5 col 7 Ident phoenix_number
line 5 col 21 Comma
line 5 col 23 String "\n"
line 5 col 27 Rparen
line 5 col 28 Semi
line 6 col 1 EOI
</pre>
</b>
 
<lang c>
/*
All lexical tokens - not syntatically correct, but that will
have to wait until syntax analysis
*/
/* Print */ print /* Sub */ -
/* Putc */ putc /* Lss */ <
/* If */ if /* Gtr */ >
/* While */ while /* Leq */ <=
/* Lbrace */ { /* Neq */ !=
/* Rbrace */ } /* And */ &&
/* Lparen */ ( /* Semi */ ;
/* Rparen */ ) /* Comma */ ,
/* Uminus */ - /* Assign */ =
/* Mul */ * /* Integer */ 42
/* Div */ / /* String */ "String literal"
/* Add */ + /* Ident */ variable_name
/* character literal */ '\n'
/* character literal */ ' '
</lang>
 
;Output
 
<b>
<pre>
line 5 col 15 Print
line 5 col 41 Sub
line 6 col 15 Putc
line 6 col 41 Lss
line 7 col 15 If
line 7 col 41 Gtr
line 8 col 15 While
line 8 col 41 Leq
line 9 col 15 Lbrace
line 9 col 41 Neq
line 10 col 15 Rbrace
line 10 col 41 And
line 11 col 15 Lparen
line 11 col 41 Semi
line 12 col 15 Rparen
line 12 col 41 Comma
line 13 col 15 Sub
line 13 col 41 Assign
line 14 col 15 Mul
line 14 col 41 Integer 42
line 15 col 15 Div
line 15 col 41 String "String literal"
line 16 col 15 Add
line 16 col 41 Ident variable_name
line 17 col 26 Integer 10
line 18 col 26 Integer 32
line 19 col 1 EOI</pre>
</b>
 
;Diagnostics
The following error conditions should be caught:
 
* Empty character constant. Example: &apos;&apos;
* Unknown escape sequence. Example: '\r'
* Multi-character constant. Example: 'xx'
* End-of-file in comment. Closing comment characters not found.
* End-of-file while scanning string literal. Closing string character not found.
* End-of-line while scanning string literal. Closing string character not found before end-of-line.
* Unrecognized character. Example: |
 
;Reference
 
The C and Python versions can be considered reference implementations.
 
;Implementations
 
__TOC__
 
=={{header|C}}==
<lang C>
#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
#include <stdbool.h>
#include <limits.h>
 
#define NELEMS(arr) (sizeof(arr) / sizeof(arr[0]))
 
/* Minimal growable-array helpers. For an array called `name` they keep two
 * companion ints: _qy_<name>_p (number of elements in use) and
 * _qy_<name>_max (allocated capacity, in elements). */

/* Declare the array pointer plus its length/capacity counters. */
#define da_dim(name, type) type *name = NULL; \
    int _qy_ ## name ## _p = 0; \
    int _qy_ ## name ## _max = 0
/* Reset the length to zero; the allocation is kept for reuse. */
#define da_rewind(name) _qy_ ## name ## _p = 0
/* Grow capacity by 32 elements when full.
 * NOTE(review): the realloc result is not checked, and this expands to a bare
 * `if`, so it must not be used directly before an `else`. */
#define da_redim(name) if (_qy_ ## name ## _p >= _qy_ ## name ## _max) \
    name = realloc(name, (_qy_ ## name ## _max += 32) * sizeof(name[0]))
/* Append x, growing the array first if needed. */
#define da_append(name, x) do {da_redim(name); name[_qy_ ## name ## _p++] = x;} while (0)
/* Number of elements currently stored. */
#define da_len(name) _qy_ ## name ## _p
 
// dependency: atr table in parse.c ordering is based on these
// (run() in this file also indexes its token-name table by these values)
typedef enum {
    EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add,
    Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident
} TokenType;
 
/* One scanned token: its kind, where it started, and an optional payload. */
typedef struct {
    int tok;                /* a TokenType value */
    int err_ln, err_col;    /* line and column where the token starts */
    union {
        int n; /* value for constants */
        char *text; /* text for idents */
    };
} tok_s;
 
/* Input and output streams, set up by init_io(). */
static FILE *source_fp, *dest_fp;
/* Scanner state: current line/column and the current look-ahead character.
 * the_ch starts as a space so the first gettok() call skips it and reads on. */
static int line = 1, col = 0, the_ch = ' ';
/* Shared buffer holding the text of the most recent identifier/number/string. */
da_dim(text, char);

/* Forward declaration: div_or_cmt() calls gettok() after skipping a comment. */
tok_s gettok();
 
/*
 * Report a fatal scanner error and terminate the program with status 1.
 *
 * err_line, err_col: source position the diagnostic refers to.
 * fmt, ...:          printf-style message.
 *
 * The message is formatted with vsnprintf so an over-long message (for
 * example a very long invalid number echoed back) is truncated instead of
 * overflowing the local buffer.
 */
static void error(int err_line, int err_col, const char *fmt, ... ) {
    char buf[1000];
    va_list ap;

    va_start(ap, fmt);
    vsnprintf(buf, sizeof buf, fmt, ap);
    va_end(ap);
    printf("(%d,%d) error: %s\n", err_line, err_col, buf);
    exit(1);
}
 
/* Fetch the next input character into the_ch and keep line/col in step:
 * a newline bumps the line counter and resets the column to 0, anything
 * else (EOF included) advances the column by one. */
static void read_ch() {
    the_ch = getc(source_fp);
    if (the_ch != '\n') {
        ++col;
        return;
    }
    ++line;
    col = 0;
}
 
/*
 * Scan the remainder of a character literal; the opening quote has already
 * been consumed and the_ch holds the first character after it.
 *
 * n:                 code of that first character (used when no escape follows).
 * err_line, err_col: position of the opening quote, used for diagnostics.
 *
 * Returns an Integerk token carrying the character code. Only the escapes
 * \n and \\ are accepted; every error path ends in error(), which exits.
 */
static tok_s char_lit(int n, int err_line, int err_col) { /* 'x' */
    switch (the_ch) {
    case '\'':
        error(err_line, err_col, "gettok: empty character constant");
        break;
    case '\\':
        read_ch();
        switch (the_ch) {
        case 'n':
            n = 10;
            break;
        case '\\':
            n = '\\';
            break;
        default:
            error(err_line, err_col, "gettok: unknown escape sequence \\%c", the_ch);
        }
        break;
    default:
        break;
    }

    read_ch();                      /* must now be the closing quote */
    if (the_ch != '\'')
        error(err_line, err_col, "multi-character constant");
    read_ch();                      /* step past the closing quote */
    return (tok_s){Integerk, err_line, err_col, {n}};
}
 
static tok_s div_or_cmt(int err_line, int err_col) { /* process divide or comments */
if (the_ch != '*')
return (tok_s){Div, err_line, err_col, {0}};
 
/* comment found */
for (;;) {
read_ch();
if (the_ch == '*' || the_ch == EOF) {
read_ch();
if (the_ch == '/' || the_ch == EOF) {
read_ch();
return gettok();
}
}
}
}
 
/*
 * Scan a string literal. On entry the_ch is the opening delimiter `start`;
 * characters are collected into the shared `text` buffer until the same
 * delimiter is seen again. A newline or EOF before the closing delimiter is
 * a fatal error. Escape sequences are stored verbatim (not translated).
 *
 * Returns a Stringk token whose text points at the shared buffer.
 */
static tok_s string_lit(int start, int err_line, int err_col) { /* "st" */
    da_rewind(text);

    read_ch();                              /* step past the opening quote */
    while (the_ch != start) {
        if (the_ch == '\n')
            error(err_line, err_col, "EOL in string");
        if (the_ch == EOF)
            error(err_line, err_col, "EOF in string");
        da_append(text, (char)the_ch);
        read_ch();
    }
    da_append(text, '\0');

    read_ch();                              /* step past the closing quote */
    return (tok_s){Stringk, err_line, err_col, {.text=text}};
}
 
/* bsearch comparator: each argument points at a `char *` (for table entries,
 * the leading member of the keyword struct); compare the strings they name. */
static int kwd_cmp(const void *p1, const void *p2) {
    const char *lhs = *(char **)p1;
    const char *rhs = *(char **)p2;

    return strcmp(lhs, rhs);
}
 
/*
 * Classify an identifier: returns the keyword's token type when `ident` is
 * one of the reserved words, otherwise Ident.
 *
 * The table must stay sorted by keyword because it is searched with bsearch.
 */
static TokenType get_ident_type(const char *ident) {
    static struct kwd_entry {
        char *s;
        TokenType sym;
    } kwds[] = {
        {"if", If},
        {"print", Print},
        {"putc", Putc},
        {"while", While},
    };
    struct kwd_entry *hit;

    hit = bsearch(&ident, kwds, NELEMS(kwds), sizeof(kwds[0]), kwd_cmp);
    if (hit == NULL)
        return Ident;
    return hit->sym;
}
 
/*
 * Scan an identifier, keyword or integer literal starting at the_ch.
 *
 * Consumes the maximal run of [_A-Za-z0-9] into the shared `text` buffer.
 *   - empty run                 -> fatal "unrecognized character"
 *   - starts with a digit       -> must be all digits; returns Integerk
 *   - otherwise                 -> keyword token or Ident (text in tok.text)
 *
 * Fixes over the earlier version:
 *   - the literal is converted in base 10; base 0 made a leading '0' select
 *     octal, so "010" scanned as 8 and "08" as 0;
 *   - overflow is detected reliably: errno is cleared before strtol and the
 *     long result is range-checked against INT_MAX before narrowing to int.
 */
static tok_s ident_or_int(int err_line, int err_col) {
    long value;
    bool is_number = true;

    da_rewind(text);
    while (isalnum(the_ch) || the_ch == '_') {
        da_append(text, (char)the_ch);
        if (!isdigit(the_ch))
            is_number = false;
        read_ch();
    }
    if (da_len(text) == 0)
        error(err_line, err_col, "gettok: unrecognized character (%d) '%c'\n", the_ch, the_ch);
    da_append(text, '\0');

    /* cast: passing a negative plain char to isdigit is undefined */
    if (isdigit((unsigned char)text[0])) {
        if (!is_number)
            error(err_line, err_col, "invalid number: %s\n", text);
        errno = 0;
        value = strtol(text, NULL, 10);
        if (errno == ERANGE || value > INT_MAX)
            error(err_line, err_col, "Number exceeds maximum value");
        return (tok_s){Integerk, err_line, err_col, {(int)value}};
    }
    return (tok_s){get_ident_type(text), err_line, err_col, {.text=text}};
}
 
/*
 * Resolve a one-or-two character operator; the first character has already
 * been consumed.
 *
 * expect: second character that completes the long form.
 * ifyes:  token returned (after consuming `expect`) when it is present.
 * ifno:   token returned when it is absent; passing EOI means the short
 *         form is not a valid token and a fatal error is raised instead.
 */
static tok_s follow(int expect, TokenType ifyes, TokenType ifno, int err_line, int err_col) { /* look ahead for '>=', etc. */
    if (the_ch != expect) {
        if (ifno == EOI)
            error(err_line, err_col, "follow: unrecognized character '%c' (%d)\n", the_ch, the_ch);
        return (tok_s){ifno, err_line, err_col, {0}};
    }

    read_ch();
    return (tok_s){ifyes, err_line, err_col, {0}};
}
 
tok_s gettok() { /* return the token type */
/* skip white space */
while (isspace(the_ch))
read_ch();
int err_line = line;
int err_col = col;
switch (the_ch) {
case '{': read_ch(); return (tok_s){Lbrace, err_line, err_col, {0}};
case '}': read_ch(); return (tok_s){Rbrace, err_line, err_col, {0}};
case '(': read_ch(); return (tok_s){Lparen, err_line, err_col, {0}};
case ')': read_ch(); return (tok_s){Rparen, err_line, err_col, {0}};
case '+': read_ch(); return (tok_s){Add, err_line, err_col, {0}};
case '-': read_ch(); return (tok_s){Sub, err_line, err_col, {0}};
case '*': read_ch(); return (tok_s){Mul, err_line, err_col, {0}};
case ';': read_ch(); return (tok_s){Semi, err_line, err_col, {0}};
case ',': read_ch(); return (tok_s){Comma, err_line, err_col, {0}};
case '>': read_ch(); return (tok_s){Gtr, err_line, err_col, {0}};
case '=': read_ch(); return (tok_s){Assign, err_line, err_col, {0}};
case '/': read_ch(); return div_or_cmt(err_line, err_col);
case '\'': read_ch(); return char_lit(the_ch, err_line, err_col);
case '<': read_ch(); return follow('=', Leq, Lss, err_line, err_col);
case '!': read_ch(); return follow('=', Neq, EOI, err_line, err_col);
case '&': read_ch(); return follow('&', And, EOI, err_line, err_col);
case '"' : return string_lit(the_ch, err_line, err_col);
default: return ident_or_int(err_line, err_col);
case EOF: return (tok_s){EOI, err_line, err_col, {0}};
}
}
 
/*
 * Tokenize the whole input and write one line per token to dest_fp:
 *
 *     line <n> col <n> <Name> [value]
 *
 * Integers print their value, identifiers their text and strings their text
 * in double quotes. The loop ends after the EOI token has been printed, and
 * dest_fp is closed unless it is stdout.
 *
 * Token names come from an array indexed by TokenType. The previous version
 * sliced one packed string at tok*9, which only works while every name is
 * padded to exactly nine characters and otherwise prints the wrong text.
 */
void run() { /* tokenize the given input */
    /* must stay in the same order as the TokenType enum */
    static const char *const tok_names[] = {
        "EOI", "Print", "Putc", "If", "While", "Lbrace", "Rbrace", "Lparen",
        "Rparen", "Uminus", "Mul", "Div", "Add", "Sub", "Lss", "Gtr", "Leq",
        "Neq", "And", "Semi", "Comma", "Assign", "Integer", "String", "Ident"
    };
    tok_s tok;

    do {
        tok = gettok();
        /* %-8s keeps the name column eight characters wide */
        fprintf(dest_fp, "line %5d col %5d %-8s",
                tok.err_ln, tok.err_col, tok_names[tok.tok]);
        if (tok.tok == Integerk)
            fprintf(dest_fp, " %8d", tok.n);
        else if (tok.tok == Ident)
            fprintf(dest_fp, " %s", tok.text);
        else if (tok.tok == Stringk)
            fprintf(dest_fp, " \"%s\"", tok.text);
        fprintf(dest_fp, "\n");
    } while (tok.tok != EOI);

    if (dest_fp != stdout)
        fclose(dest_fp);
}
 
/*
 * Bind *fp to a stream: the standard stream `std` when `fn` is the empty
 * string, otherwise the file `fn` opened with `mode`. A failed open is fatal.
 */
void init_io(FILE **fp, FILE *std, const char mode[], const char fn[]) {
    if (fn[0] == '\0') {
        *fp = std;
        return;
    }

    *fp = fopen(fn, mode);
    if (*fp == NULL)
        error(0, 0, "Can't open %s\n", fn);
}
 
/* Entry point: optional argv[1] is the source file (default stdin) and
 * optional argv[2] the output file (default stdout). */
int main(int argc, char *argv[]) {
    const char *in_name = argc > 1 ? argv[1] : "";
    const char *out_name = argc > 2 ? argv[2] : "";

    init_io(&source_fp, stdin, "r", in_name);
    init_io(&dest_fp, stdout, "wb", out_name);
    run();
    return 0;
}
</lang>
 
=={{header|FreeBASIC}}==
<lang FreeBASIC>
' Token kinds produced by the scanner. scanner() sizes and fills its name
' table from tk_eoi to tk_ident, so those must remain the first and last
' members.
enum Token_type
    tk_eoi
    tk_print
    tk_putc
    tk_if
    tk_while
    tk_lbrace
    tk_rbrace
    tk_lparen
    tk_rparen
    tk_uminus
    tk_mul
    tk_div
    tk_add
    tk_sub
    tk_lss
    tk_gtr
    tk_leq
    tk_neq
    tk_and
    tk_semi
    tk_comma
    tk_assign
    tk_integer
    tk_string
    tk_ident
end enum
 
' Character constants used by the scanner
const NewLine = chr(10)
const DoubleQuote = chr(34)

' where we store keywords and variables
type Symbol
    s_name as string        ' spelling of the symbol
    tok as Token_type       ' token kind reported for it
end type

' symbol table, grown one entry at a time by install()
dim shared symtab() as Symbol

' scanner state
dim shared cur_line as string       ' current source line, NewLine appended
dim shared cur_ch as string         ' current character; "" means end-of-file
dim shared line_num as integer      ' line number of cur_line
dim shared col_num as integer       ' 1-based column of cur_ch within cur_line
 
' True (-1) when ch is non-empty and lies between "0" and "9", else 0.
function is_digit(byval ch as string) as long
    if ch = "" then return 0
    return ch >= "0" and ch <= "9"
end function
 
' True (-1) when ch is non-empty and is a letter (either case) or a digit.
function is_alnum(byval ch as string) as long
    if ch = "" then return 0
    dim uc as string = UCase(ch)
    return (uc >= "A" and uc <= "Z") or is_digit(ch)
end function
 
' Print "(line:col) message" and terminate the program.
sub error_msg(byval eline as integer, byval ecol as integer, byval msg as string)
    print "("; eline; ":"; ecol; ") "; msg
    system
end sub
 
' add an identifier to the symbol table
' Append (s_name, tok) to the symbol table and return the index of the new
' entry.
function install(byval s_name as string, byval tok as Token_type) as integer
    dim slot as integer = ubound(symtab) + 1

    redim preserve symtab(slot)
    symtab(slot).s_name = s_name
    symtab(slot).tok = tok
    return slot
end function
 
' search for an identifier in the symbol table
' Linear search of the symbol table: returns the index of the first entry
' named s_name, or -1 when there is none.
function lookup(byval s_name as string) as integer
    dim idx as integer = lbound(symtab)

    do while idx <= ubound(symtab)
        if symtab(idx).s_name = s_name then return idx
        idx += 1
    loop
    return -1
end function
 
' Load the next source line from file #1 into cur_line (with NewLine
' appended), bump line_num and reset col_num to 1. At end-of-file cur_line
' and cur_ch are left empty; an empty cur_ch is the end-of-file marker.
sub next_line()
    cur_line = ""
    cur_ch = ""
    if not eof(1) then
        line input #1, cur_line
        cur_line += NewLine
        line_num += 1
        col_num = 1
    end if
end sub
 
' Advance to the next character, pulling in a new line when the current one
' is exhausted. cur_ch ends up "" when no more input is available.
sub next_char()
    col_num += 1
    if col_num > len(cur_line) then next_line()
    if col_num <= len(cur_line) then
        cur_ch = mid(cur_line, col_num, 1)
    else
        cur_ch = ""
    end if
end sub
 
' Resolve a one-or-two character operator whose first character has already
' been consumed. Returns ifyes (consuming expect) when the next character is
' expect, otherwise ifno; ifno = tk_eoi means the short form is invalid and
' an error is reported.
function follow(byval err_line as integer, byval err_col as integer, byval expect as string, byval ifyes as Token_type, byval ifno as Token_type) as Token_type
    if cur_ch <> expect then
        if ifno = tk_eoi then error_msg(err_line, err_col, "follow unrecognized character: " + cur_ch)
        return ifno
    end if
    next_char()
    return ifyes
end function
 
' Scan the next token.
'
' Outputs (all byref):
'   err_line, err_col - position where the token starts
'   tok               - token kind
'   v                 - token text for integers, identifiers and strings
'                       (strings keep their surrounding double quotes)
'
' Comment handling differs from the earlier version in two ways: a "*" that
' is not followed by "/" is re-examined rather than skipped, so comments
' ending in "**" + "/" close properly, and end-of-file inside a comment is
' reported as an error instead of quietly producing tk_eoi.
sub gettok(byref err_line as integer, byref err_col as integer, byref tok as Token_type, byref v as string)
    ' skip whitespace
    do while (cur_ch = " " or cur_ch = chr(9) or cur_ch = NewLine) and (cur_ch <> "")
        next_char()
    loop

    err_line = line_num
    err_col = col_num

    select case cur_ch
        case "": tok = tk_eoi: exit sub
        case "{": tok = tk_lbrace: next_char(): exit sub
        case "}": tok = tk_rbrace: next_char(): exit sub
        case "(": tok = tk_lparen: next_char(): exit sub
        case ")": tok = tk_rparen: next_char(): exit sub
        case "+": tok = tk_add: next_char(): exit sub
        case "-": tok = tk_sub: next_char(): exit sub
        case "*": tok = tk_mul: next_char(): exit sub
        case ";": tok = tk_semi: next_char(): exit sub
        case ",": tok = tk_comma: next_char(): exit sub
        case ">": tok = tk_gtr: next_char(): exit sub
        case "=": tok = tk_assign: next_char(): exit sub
        case "/": ' div or comment
            next_char()
            if cur_ch <> "*" then
                tok = tk_div
                exit sub
            end if
            ' skip comments; first step past the opening "*"
            next_char()
            do
                if cur_ch = "" then error_msg(err_line, err_col, "EOF in comment")
                if cur_ch = "*" then
                    next_char()
                    if cur_ch = "/" then
                        next_char()
                        gettok(err_line, err_col, tok, v)
                        exit sub
                    end if
                    ' not a terminator: loop without reading, cur_ch may be "*"
                else
                    next_char()
                end if
            loop
        case "'": ' single char literals
            next_char()
            v = str(Asc(cur_ch))
            if cur_ch = "'" then error_msg(err_line, err_col, "empty character constant")
            if cur_ch = "\" then
                next_char()
                if cur_ch = "n" then
                    v = "10"
                elseif cur_ch = "\" then
                    v = Str(Asc("\"))
                else
                    error_msg(err_line, err_col, "unknown escape sequence: " + cur_ch)
                end if
            end if
            next_char()
            if cur_ch <> "'" then error_msg(err_line, err_col, "multi-character constant")
            next_char()
            tok = tk_integer
            exit sub
        case "<": next_char(): tok = follow(err_line, err_col, "=", tk_Leq, tk_Lss): exit sub
        case "!": next_char(): tok = follow(err_line, err_col, "=", tk_Neq, tk_EOI): exit sub
        case "&": next_char(): tok = follow(err_line, err_col, "&", tk_And, tk_EOI): exit sub
        case DoubleQuote: ' string
            v = cur_ch
            next_char()
            do while cur_ch <> DoubleQuote
                if cur_ch = NewLine then error_msg(err_line, err_col, "EOL in string")
                if cur_ch = "" then error_msg(err_line, err_col, "EOF in string")
                v += cur_ch
                next_char()
            loop
            v += cur_ch
            next_char()
            tok = tk_string
            exit sub
        case else ' integers or identifiers
            dim is_number as boolean = is_digit(cur_ch)
            v = ""
            do while is_alnum(cur_ch) orelse cur_ch = "_"
                if not is_digit(cur_ch) then is_number = false
                v += cur_ch
                next_char()
            loop
            if len(v) = 0 then error_msg(err_line, err_col, "unknown character: " + cur_ch)
            if is_digit(mid(v, 1, 1)) then
                if not is_number then error_msg(err_line, err_col, "invalid number: " + v)
                tok = tk_integer
                exit sub
            end if
            dim as integer index = lookup(v)
            if index = -1 then
                tok = tk_ident
            else
                tok = symtab(index).tok
            end if
            exit sub
    end select
end sub
 
' Prepare the scanner: open filein as file #1, register the keywords, reset
' the position counters and prime cur_ch with the first character.
sub init_lex(byval filein as string)
    open filein for input as #1

    install("if", tk_if)
    install("print", tk_print)
    install("putc", tk_putc)
    install("while", tk_while)

    line_num = 0
    col_num = 0
    cur_line = ""
    next_char()
end sub
 
' Drive the scanner: fetch tokens until tk_eoi and print one line per token
' (position, token name, and the text for integer/ident/string tokens).
sub scanner()
    dim err_line as integer
    dim err_col as integer
    dim tok as Token_type
    dim v as string
    ' printable name for every token kind, indexed by Token_type
    dim tok_list(tk_eoi to tk_ident) as string

    tok_list(tk_eoi ) = "EOI"
    tok_list(tk_print ) = "Print"
    tok_list(tk_putc ) = "Putc"
    tok_list(tk_if ) = "If"
    tok_list(tk_while ) = "While"
    tok_list(tk_lbrace ) = "Lbrace"
    tok_list(tk_rbrace ) = "Rbrace"
    tok_list(tk_lparen ) = "Lparen"
    tok_list(tk_rparen ) = "Rparen"
    tok_list(tk_uminus ) = "Uminus"
    tok_list(tk_mul ) = "Mul"
    tok_list(tk_div ) = "Div"
    tok_list(tk_add ) = "Add"
    tok_list(tk_sub ) = "Sub"
    tok_list(tk_lss ) = "Lss"
    tok_list(tk_gtr ) = "Gtr"
    tok_list(tk_leq ) = "Leq"
    tok_list(tk_neq ) = "Neq"
    tok_list(tk_and ) = "And"
    tok_list(tk_semi ) = "Semi"
    tok_list(tk_comma ) = "Comma"
    tok_list(tk_assign ) = "Assign"
    tok_list(tk_integer ) = "Integer"
    tok_list(tk_string ) = "String"
    tok_list(tk_ident ) = "Ident"

    do
        gettok(err_line, err_col, tok, v)
        ' NOTE(review): the "\ \" string field looks narrower than the longest
        ' token name; its internal spacing may have been lost - confirm the
        ' intended field width.
        print using "line ##### col ##### \ \"; err_line; err_col; tok_list(tok);
        if tok = tk_integer orelse tok = tk_ident orelse tok = tk_string then print " " + v;
        print
    loop until tok = tk_eoi
end sub
 
' Program entry: requires the source file name as the first command-line
' argument, then initializes the lexer and prints every token.
sub main()
    if command(1) = "" then
        print "filename required"
        system
    end if
    init_lex(command(1))
    scanner()
end sub
 
main()
system
</lang>
 
=={{header|Python}}==
<lang Python>
import sys
 
# following two must remain in the same order
# (each token constant is the index of its printable name in all_syms)
EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add, \
Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident = range(25)

all_syms = [ 'EOI', 'Print', 'Putc', 'If', 'While', 'Lbrace', 'Rbrace', 'Lparen',
'Rparen', 'Uminus', 'Mul', 'Div', 'Add', 'Sub', 'Lss', 'Gtr', 'Leq', 'Neq', 'And',
'Semi', 'Comma', 'Assign', 'Integer', 'String', 'Ident' ]

# single character only symbols
symbols = { '{': Lbrace, '}': Rbrace, '(': Lparen, ')': Rparen, '+': Add, '-': Sub,
'*': Mul, ';': Semi, ',': Comma, '>': Gtr, '=': Assign }

# reserved words and the token each one maps to
key_words = { 'if': If, 'print': Print, 'putc': Putc, 'while': While }

# scanner state shared by the functions below
the_ch = " " # dummy first char - but it must be a space
the_col = 0
the_line = 1
input_file = None
 
#*** show error and exit
def error(line, col, msg):
    """Print a diagnostic for source position (line, col) and exit with status 1.

    Uses sys.exit rather than the builtin ``exit``, which is installed by the
    ``site`` module for interactive use and is absent under ``python -S``.
    """
    print(line, col, msg)
    sys.exit(1)
 
#*** get the next character from the input
def getc():
    """Read one character into the_ch, update the position counters, return it.

    A newline advances the_line and resets the_col to 0; any other character,
    including the empty string read at end of input, advances the_col by one.
    (Block structure restored: the body had lost its indentation.)
    """
    global the_ch, the_col, the_line

    the_ch = input_file.read(1)
    if the_ch == '\n':
        the_line += 1
        the_col = 0
    else:
        the_col += 1
    return the_ch
 
#*** 'x' - character constants
def char_lit(err_line, err_col):
    """Scan a character constant; the_ch is the opening quote on entry.

    Returns (Integerk, err_line, err_col, code), where code is the integer
    character code. Only the escapes \\n and \\\\ are recognized.

    Fixes over the earlier version:
      * the backslash escape now yields ord('\\\\'), an int, instead of a str
        that broke the driver's %d formatting;
      * end of input right after the opening quote is reported through
        error() instead of crashing in ord('');
      * the unknown-escape message uses %s, so end of input after a backslash
        no longer raises TypeError from %c.
    """
    getc()                              # skip opening quote
    if not the_ch:
        error(err_line, err_col, "EOF in character constant")
    n = ord(the_ch)
    if the_ch == '\'':
        error(err_line, err_col, "empty character constant")
    elif the_ch == '\\':
        getc()
        if the_ch == 'n':
            n = 10
        elif the_ch == '\\':
            n = ord('\\')
        else:
            error(err_line, err_col, "unknown escape sequence \\%s" % (the_ch))
    if getc() != '\'':
        error(err_line, err_col, "multi-character constant")
    getc()                              # skip closing quote
    return Integerk, err_line, err_col, n
 
#*** process divide or comments
def div_or_cmt(err_line, err_col):
    """Handle '/': return a Div token, or skip a comment and return what follows.

    the_ch is the '/' on entry. End of input inside a comment is an error.

    Fix over the earlier version: a '*' not followed by '/' is re-examined
    rather than skipped, so comments ending in '**' + '/' now terminate.
    """
    if getc() != '*':
        return Div, err_line, err_col

    # comment found: step past the opening '*' so '/' + '*' + '/' is not a
    # complete comment
    getc()
    while True:
        if not the_ch:
            error(err_line, err_col, "EOF in comment")
        if the_ch == '*':
            if getc() == '/':
                getc()
                return gettok()
            # not a terminator: loop again without reading, the_ch may be '*'
        else:
            getc()
 
#*** "string"
def string_lit(start, err_line, err_col):
    """Scan a string literal delimited by `start`; the_ch is the opening quote.

    Returns (Stringk, err_line, err_col, text), where text is the raw contents
    between the delimiters (escape sequences are kept verbatim). End of input
    or end of line before the closing delimiter is an error.
    (Block structure restored; characters are gathered in a list and joined.)
    """
    chars = []

    while getc() != start:
        if not the_ch:
            error(err_line, err_col, "EOF while scanning string literal")
        if the_ch == '\n':
            error(err_line, err_col, "EOL while scanning string literal")
        chars.append(the_ch)

    getc()                              # skip closing quote
    return Stringk, err_line, err_col, "".join(chars)
 
#*** handle identifiers and integers
def _is_ascii_digit(ch):
    """True when ch is one of the ASCII digits 0-9 (False for '')."""
    return '0' <= ch <= '9'


def _is_ident_char(ch):
    """True when ch matches [_a-zA-Z0-9], the character class in the task spec."""
    return ch == '_' or 'a' <= ch <= 'z' or 'A' <= ch <= 'Z' or _is_ascii_digit(ch)


def ident_or_int(err_line, err_col):
    """Scan an identifier, keyword or integer starting at the_ch.

    Returns (Integerk, line, col, value) for an all-digit run,
    (keyword_token, line, col) for a reserved word, and
    (Ident, line, col, text) otherwise. A character that cannot start any of
    these, or a number containing letters, is reported through error().

    Fix over the earlier version: classification uses explicit ASCII ranges.
    str.isalnum()/str.isdigit() also accept non-ASCII letters and digit-like
    characters (e.g. superscripts), which the spec does not allow and which
    made int() raise ValueError.
    """
    is_number = True
    text = ""

    while _is_ident_char(the_ch):
        text += the_ch
        if not _is_ascii_digit(the_ch):
            is_number = False
        getc()

    if not text:
        error(err_line, err_col, "ident_or_int: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))

    if _is_ascii_digit(text[0]):
        if not is_number:
            error(err_line, err_col, "invalid number: %s" % (text))
        return Integerk, err_line, err_col, int(text)

    if text in key_words:
        return key_words[text], err_line, err_col

    return Ident, err_line, err_col, text
 
#*** look ahead for '>=', etc.
def follow(expect, ifyes, ifno, err_line, err_col):
    """Resolve a one-or-two character operator; the_ch is its first character.

    Returns (ifyes, line, col) when the next character is `expect` (and
    consumes it), otherwise (ifno, line, col). Passing EOI as `ifno` means the
    short form is not a token, so an error is reported instead.

    Fix over the earlier version: end of input after the first character is
    reported through error() instead of crashing in ord('').
    """
    if getc() == expect:
        getc()
        return ifyes, err_line, err_col

    if ifno == EOI:
        if not the_ch:
            error(err_line, err_col, "follow: unexpected end of input")
        error(err_line, err_col, "follow: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))

    return ifno, err_line, err_col
 
#*** return the next token type
def gettok():
    """Return the next token as a tuple (token, line, col[, value]).

    Skips white space, records the start position, then dispatches on the
    current character. End of input yields (EOI, line, col).
    (Block structure restored: the body had lost its indentation.)
    """
    while the_ch.isspace():
        getc()

    err_line = the_line
    err_col = the_col

    if not the_ch:
        return EOI, err_line, err_col
    if the_ch in symbols:
        sym = symbols[the_ch]
        getc()
        return sym, err_line, err_col
    if the_ch == '/':
        return div_or_cmt(err_line, err_col)
    if the_ch == '\'':
        return char_lit(err_line, err_col)
    if the_ch == '<':
        return follow('=', Leq, Lss, err_line, err_col)
    if the_ch == '!':
        return follow('=', Neq, EOI, err_line, err_col)
    if the_ch == '&':
        return follow('&', And, EOI, err_line, err_col)
    if the_ch == '"':
        return string_lit(the_ch, err_line, err_col)
    return ident_or_int(err_line, err_col)
 
#*** main driver
# Reads from the file named by the first command-line argument, or stdin when
# none is given, and prints one line per token until EOI has been printed.
# (Block structure restored: the nested statements had lost their indentation.)
input_file = sys.stdin
if len(sys.argv) > 1:
    try:
        input_file = open(sys.argv[1], "r", 4096)
    except IOError as e:
        error(0, 0, "Can't open %s" % sys.argv[1])

while True:
    t = gettok()
    tok, line, col = t[0], t[1], t[2]

    # Integer, Ident and String tokens carry a value in t[3]
    if tok == Integerk:
        print("line %5d col %5d %-8s %8d" % (line, col, all_syms[tok], t[3]))
    elif tok == Ident:
        print("line %5d col %5d %-8s %s" % (line, col, all_syms[tok], t[3]))
    elif tok == Stringk:
        print('line %5d col %5d %-8s "%s"' % (line, col, all_syms[tok], t[3]))
    else:
        print("line %5d col %5d %-8s" % (line, col, all_syms[tok]))

    if tok == EOI:
        break
</lang>
155

edits