User:Ed Davis: Difference between revisions
Content added Content deleted
No edit summary |
m (Replaced content with "Hello, World!") |
||
Line 1: | Line 1: | ||
Hello, World! |
|||
Lexical analysis is the process of converting a sequence of characters (such as in a |
|||
computer program or web page) into a sequence of tokens (strings with an identified |
|||
"meaning"). A program that performs lexical analysis may be called a lexer, tokenizer, |
|||
or scanner (though "scanner" is also used to refer to the first stage of a lexer). |
|||
;The Task |
|||
Create a lexical analyzer for the Tiny programming language. The |
|||
program should read input from a file and/or stdin, and write |
|||
output to a file and/or stdout. |
|||
;Specification |
|||
The various token types are denoted below. |
|||
;Operators |
|||
{| class="wikitable" |
|||
|- |
|||
! Characters !! Common name !! Name |
|||
|- |
|||
| * || multiply || Mul |
|||
|- |
|||
| / || divide || Div |
|||
|- |
|||
| + || plus || Add |
|||
|- |
|||
| - || minus and unary minus || Sub and Uminus |
|||
|- |
|||
| < || less than || Lss |
|||
|- |
|||
| <= || less than or equal || Leq |
|||
|- |
|||
| > || greater than || Gtr |
|||
|- |
|||
| != || not equal || Neq |
|||
|- |
|||
| = || assign || Assign |
|||
|- |
|||
| && || and || And |
|||
|} |
|||
;Symbols |
|||
{| class="wikitable" |
|||
|- |
|||
! Characters !! Common name !! Name |
|||
|- |
|||
| ( || left parenthesis || Lparen |
|||
|- |
|||
| ) || right parenthesis || Rparen |
|||
|- |
|||
| { || left brace || Lbrace |
|||
|- |
|||
| } || right brace || Rbrace |
|||
|- |
|||
| ; || semicolon || Semi |
|||
|- |
|||
| , || comma || Comma |
|||
|} |
|||
;Keywords |
|||
{| class="wikitable" |
|||
|- |
|||
! Characters !! Name |
|||
|- |
|||
| if || If |
|||
|- |
|||
| while || While |
|||
|- |
|||
| print || Print |
|||
|- |
|||
| putc || Putc |
|||
|} |
|||
;Other entities |
|||
{| class="wikitable" |
|||
|- |
|||
! Characters !! Regular expression !! Name |
|||
|- |
|||
| integers || [0-9]+ || Integer |
|||
|- |
|||
| char literal || 'x' || Integer |
|||
|- |
|||
| identifiers || [_a-zA-Z][_a-zA-Z0-9]* || Ident |
|||
|- |
|||
| string literal || "[^"\n]*" || String |
|||
|} |
|||
Notes: For char literals, '\n' is supported as a new line |
|||
character. To represent \, use: '\\'. \n may also be used in |
|||
Strings, to print a newline. No other special sequences are |
|||
supported. |
|||
'''Comments''' /* ... */ (multi-line) |
|||
;Complete list of token names |
|||
'''EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add, Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integer, String, Ident''' |
|||
;Program output |
|||
Output of the program should be: |
|||
* the word line, followed by: |
|||
* the line number where the token starts, followed by: |
|||
* the abbreviation col, followed by: |
|||
* the column number where the token starts, followed by: |
|||
* the token name. |
|||
* If the token name is one of Integer, Ident or String, the actual value of the same should follow. |
|||
;Test Cases |
|||
<lang c> |
|||
/* |
|||
Hello world |
|||
*/ |
|||
print("Hello, World!\n"); |
|||
</lang> |
|||
;Output |
|||
<b> |
|||
<pre> |
|||
line 4 col 1 Print |
|||
line 4 col 6 Lparen |
|||
line 4 col 7 String "Hello, World!\n" |
|||
line 4 col 24 Rparen |
|||
line 4 col 25 Semi |
|||
line 5 col 1 EOI |
|||
</pre> |
|||
</b> |
|||
<lang c> |
|||
/* |
|||
Show Ident and Integers |
|||
*/ |
|||
phoenix_number = 142857; |
|||
print(phoenix_number, "\n"); |
|||
</lang> |
|||
;Output |
|||
<b> |
|||
<pre> |
|||
line 4 col 1 Ident phoenix_number |
|||
line 4 col 16 Assign |
|||
line 4 col 18 Integer 142857 |
|||
line 4 col 24 Semi |
|||
line 5 col 1 Print |
|||
line 5 col 6 Lparen |
|||
line 5 col 7 Ident phoenix_number |
|||
line 5 col 21 Comma |
|||
line 5 col 23 String "\n" |
|||
line 5 col 27 Rparen |
|||
line 5 col 28 Semi |
|||
line 6 col 1 EOI |
|||
</pre> |
|||
</b> |
|||
<lang c> |
|||
/* |
|||
All lexical tokens - not syntactically correct, but that will |
|||
have to wait until syntax analysis |
|||
*/ |
|||
/* Print */ print /* Sub */ - |
|||
/* Putc */ putc /* Lss */ < |
|||
/* If */ if /* Gtr */ > |
|||
/* While */ while /* Leq */ <= |
|||
/* Lbrace */ { /* Neq */ != |
|||
/* Rbrace */ } /* And */ && |
|||
/* Lparen */ ( /* Semi */ ; |
|||
/* Rparen */ ) /* Comma */ , |
|||
/* Uminus */ - /* Assign */ = |
|||
/* Mul */ * /* Integer */ 42 |
|||
/* Div */ / /* String */ "String literal" |
|||
/* Add */ + /* Ident */ variable_name |
|||
/* character literal */ '\n' |
|||
/* character literal */ ' ' |
|||
</lang> |
|||
;Output |
|||
<b> |
|||
<pre> |
|||
line 5 col 15 Print |
|||
line 5 col 41 Sub |
|||
line 6 col 15 Putc |
|||
line 6 col 41 Lss |
|||
line 7 col 15 If |
|||
line 7 col 41 Gtr |
|||
line 8 col 15 While |
|||
line 8 col 41 Leq |
|||
line 9 col 15 Lbrace |
|||
line 9 col 41 Neq |
|||
line 10 col 15 Rbrace |
|||
line 10 col 41 And |
|||
line 11 col 15 Lparen |
|||
line 11 col 41 Semi |
|||
line 12 col 15 Rparen |
|||
line 12 col 41 Comma |
|||
line 13 col 15 Sub |
|||
line 13 col 41 Assign |
|||
line 14 col 15 Mul |
|||
line 14 col 41 Integer 42 |
|||
line 15 col 15 Div |
|||
line 15 col 41 String "String literal" |
|||
line 16 col 15 Add |
|||
line 16 col 41 Ident variable_name |
|||
line 17 col 26 Integer 10 |
|||
line 18 col 26 Integer 32 |
|||
line 19 col 1 EOI</pre> |
|||
</b> |
|||
;Diagnostics |
|||
The following error conditions should be caught: |
|||
* Empty character constant. Example: '' |
|||
* Unknown escape sequence. Example: '\r' |
|||
* Multi-character constant. Example: 'xx' |
|||
* End-of-file in comment. Closing comment characters not found. |
|||
* End-of-file while scanning string literal. Closing string character not found. |
|||
* End-of-line while scanning string literal. Closing string character not found before end-of-line. |
|||
* Unrecognized character. Example: | |
|||
;Reference |
|||
The C and Python versions can be considered reference implementations. |
|||
;Implementations |
|||
=={{header|C}}== |
|||
<lang C> |
|||
#include <stdlib.h> |
|||
#include <stdio.h> |
|||
#include <stdarg.h> |
|||
#include <ctype.h> |
|||
#include <string.h> |
|||
#include <errno.h> |
|||
#include <stdbool.h> |
|||
#include <limits.h> |
|||
/* Number of elements in a statically sized array. */
#define NELEMS(arr) (sizeof(arr) / sizeof(arr[0]))

/*
 * Minimal growable array ("da" = dynamic array).
 *
 * da_dim(name, type) declares `type *name` plus two hidden counters:
 *   _qy_<name>_p    number of elements in use
 *   _qy_<name>_max  number of elements allocated
 */
#define da_dim(name, type)  type *name = NULL;           \
                            int _qy_ ## name ## _p = 0;  \
                            int _qy_ ## name ## _max = 0

/* Forget the contents but keep the allocation for reuse. */
#define da_rewind(name)     _qy_ ## name ## _p = 0

/*
 * Grow the array by 32 elements when it is full.  Wrapped in do/while(0)
 * so it behaves as a single statement, and the realloc() result is checked
 * so an allocation failure cannot turn into a NULL dereference.
 */
#define da_redim(name)      do {                                             \
        if (_qy_ ## name ## _p >= _qy_ ## name ## _max) {                    \
            void *_qy_tmp = realloc(name,                                    \
                    (_qy_ ## name ## _max + 32) * sizeof(name[0]));          \
            if (_qy_tmp == NULL) {                                           \
                fprintf(stderr, "out of memory\n");                          \
                exit(1);                                                     \
            }                                                                \
            name = _qy_tmp;                                                  \
            _qy_ ## name ## _max += 32;                                      \
        }                                                                    \
    } while (0)

/* Append x, growing the array first if necessary. */
#define da_append(name, x)  do {da_redim(name); name[_qy_ ## name ## _p++] = x;} while (0)

/* Number of elements currently stored. */
#define da_len(name)        _qy_ ## name ## _p
|||
/* dependency: the atr table in parse.c is ordered to match this enum */
typedef enum {
    EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add,
    Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident
} TokenType;

/* One scanned token: its type, where it starts, and an optional payload. */
typedef struct {
    int tok;                /* a TokenType value */
    int err_ln, err_col;    /* line and column where the token starts */
    union {
        int   n;            /* value for constants */
        char *text;         /* text for idents */
    };
} tok_s;
|||
static FILE *source_fp, *dest_fp; |
|||
static int line = 1, col = 0, the_ch = ' '; |
|||
da_dim(text, char); |
|||
tok_s gettok(); |
|||
/*
 * Report a fatal error as "(line,col) error: message" on stdout and exit
 * with status 1.  fmt and the variadic arguments follow printf conventions.
 */
static void error(int err_line, int err_col, const char *fmt, ... ) {
    char buf[1000];
    va_list ap;

    va_start(ap, fmt);
    /* bounded write: an over-long lexeme is truncated instead of overflowing buf */
    vsnprintf(buf, sizeof buf, fmt, ap);
    va_end(ap);
    printf("(%d,%d) error: %s\n", err_line, err_col, buf);
    exit(1);
}
|||
static int next_ch() { /* get next char from input */ |
|||
the_ch = getc(source_fp); |
|||
++col; |
|||
if (the_ch == '\n') { |
|||
++line; |
|||
col = 0; |
|||
} |
|||
return the_ch; |
|||
} |
|||
static tok_s char_lit(int n, int err_line, int err_col) { /* 'x' */ |
|||
if (the_ch == '\'') |
|||
error(err_line, err_col, "gettok: empty character constant"); |
|||
if (the_ch == '\\') { |
|||
next_ch(); |
|||
if (the_ch == 'n') |
|||
n = 10; |
|||
else if (the_ch == '\\') |
|||
n = '\\'; |
|||
else error(err_line, err_col, "gettok: unknown escape sequence \\%c", the_ch); |
|||
} |
|||
if (next_ch() != '\'') |
|||
error(err_line, err_col, "multi-character constant"); |
|||
next_ch(); |
|||
return (tok_s){Integerk, err_line, err_col, {n}}; |
|||
} |
|||
static tok_s div_or_cmt(int err_line, int err_col) { /* process divide or comments */ |
|||
if (the_ch != '*') |
|||
return (tok_s){Div, err_line, err_col, {0}}; |
|||
/* comment found */ |
|||
for (;;) { |
|||
if (next_ch() == '*' && next_ch() == '/') { |
|||
next_ch(); |
|||
return gettok(); |
|||
} else if (the_ch == EOF) |
|||
error(err_line, err_col, "EOF in comment"); |
|||
} |
|||
} |
|||
static tok_s string_lit(int start, int err_line, int err_col) { /* "st" */ |
|||
da_rewind(text); |
|||
while (next_ch() != start) { |
|||
if (the_ch == '\n') error(err_line, err_col, "EOL in string"); |
|||
if (the_ch == EOF) error(err_line, err_col, "EOF in string"); |
|||
da_append(text, (char)the_ch); |
|||
} |
|||
da_append(text, '\0'); |
|||
next_ch(); |
|||
return (tok_s){Stringk, err_line, err_col, {.text=text}}; |
|||
} |
|||
/*
 * bsearch() comparator.  Both arguments point at a `char *`: the key is the
 * address of the identifier pointer, and each table element starts with
 * its keyword string pointer.
 */
static int kwd_cmp(const void *p1, const void *p2) {
    const char *lhs = *(const char *const *)p1;
    const char *rhs = *(const char *const *)p2;
    return strcmp(lhs, rhs);
}
|||
static TokenType get_ident_type(const char *ident) { |
|||
static struct { |
|||
char *s; |
|||
TokenType sym; |
|||
} kwds[] = { |
|||
{"if", If}, |
|||
{"print", Print}, |
|||
{"putc", Putc}, |
|||
{"while", While}, |
|||
}, *kwp; |
|||
return (kwp = bsearch(&ident, kwds, NELEMS(kwds), sizeof(kwds[0]), kwd_cmp)) == NULL ? Ident : kwp->sym; |
|||
} |
|||
static tok_s ident_or_int(int err_line, int err_col) { |
|||
int n, is_number = true; |
|||
da_rewind(text); |
|||
while (isalnum(the_ch) || the_ch == '_') { |
|||
da_append(text, (char)the_ch); |
|||
if (!isdigit(the_ch)) |
|||
is_number = false; |
|||
next_ch(); |
|||
} |
|||
if (da_len(text) == 0) |
|||
error(err_line, err_col, "gettok: unrecognized character (%d) '%c'\n", the_ch, the_ch); |
|||
da_append(text, '\0'); |
|||
if (isdigit(text[0])) { |
|||
if (!is_number) |
|||
error(err_line, err_col, "invalid number: %s\n", text); |
|||
n = strtol(text, NULL, 0); |
|||
if (n == LONG_MAX && errno == ERANGE) |
|||
error(err_line, err_col, "Number exceeds maximum value"); |
|||
return (tok_s){Integerk, err_line, err_col, {n}}; |
|||
} |
|||
return (tok_s){get_ident_type(text), err_line, err_col, {.text=text}}; |
|||
} |
|||
/*
 * Resolve a possibly two-character operator.  The first character has been
 * consumed already.  If the_ch equals expect, consume it and return ifyes;
 * otherwise return ifno.  Passing EOI as ifno means the first character is
 * not a token on its own, so a mismatch is a fatal error.
 */
static tok_s follow(int expect, TokenType ifyes, TokenType ifno, int err_line, int err_col) {
    if (the_ch == expect) {
        next_ch();
        return (tok_s){ifyes, err_line, err_col, {0}};
    }
    if (ifno == EOI)
        error(err_line, err_col, "follow: unrecognized character '%c' (%d)\n", the_ch, the_ch);
    return (tok_s){ifno, err_line, err_col, {0}};
}
|||
tok_s gettok() { /* return the token type */ |
|||
/* skip white space */ |
|||
while (isspace(the_ch)) |
|||
next_ch(); |
|||
int err_line = line; |
|||
int err_col = col; |
|||
switch (the_ch) { |
|||
case '{': next_ch(); return (tok_s){Lbrace, err_line, err_col, {0}}; |
|||
case '}': next_ch(); return (tok_s){Rbrace, err_line, err_col, {0}}; |
|||
case '(': next_ch(); return (tok_s){Lparen, err_line, err_col, {0}}; |
|||
case ')': next_ch(); return (tok_s){Rparen, err_line, err_col, {0}}; |
|||
case '+': next_ch(); return (tok_s){Add, err_line, err_col, {0}}; |
|||
case '-': next_ch(); return (tok_s){Sub, err_line, err_col, {0}}; |
|||
case '*': next_ch(); return (tok_s){Mul, err_line, err_col, {0}}; |
|||
case ';': next_ch(); return (tok_s){Semi, err_line, err_col, {0}}; |
|||
case ',': next_ch(); return (tok_s){Comma, err_line, err_col, {0}}; |
|||
case '>': next_ch(); return (tok_s){Gtr, err_line, err_col, {0}}; |
|||
case '=': next_ch(); return (tok_s){Assign, err_line, err_col, {0}}; |
|||
case '/': next_ch(); return div_or_cmt(err_line, err_col); |
|||
case '\'': next_ch(); return char_lit(the_ch, err_line, err_col); |
|||
case '<': next_ch(); return follow('=', Leq, Lss, err_line, err_col); |
|||
case '!': next_ch(); return follow('=', Neq, EOI, err_line, err_col); |
|||
case '&': next_ch(); return follow('&', And, EOI, err_line, err_col); |
|||
case '"' : return string_lit(the_ch, err_line, err_col); |
|||
default: return ident_or_int(err_line, err_col); |
|||
case EOF: return (tok_s){EOI, err_line, err_col, {0}}; |
|||
} |
|||
} |
|||
void run() { /* tokenize the given input */ |
|||
tok_s tok; |
|||
do { |
|||
tok = gettok(); |
|||
fprintf(dest_fp, "line %5d col %5d %.8s", |
|||
tok.err_ln, tok.err_col, |
|||
&"EOI Print Putc If While Lbrace Rbrace Lparen Rparen " |
|||
"Uminus Mul Div Add Sub Lss Gtr Leq Neq " |
|||
"And Semi Comma Assign Integer String Ident "[tok.tok * 9]); |
|||
if (tok.tok == Integerk) fprintf(dest_fp, " %4d", tok.n); |
|||
else if (tok.tok == Ident) fprintf(dest_fp, " %s", tok.text); |
|||
else if (tok.tok == Stringk) fprintf(dest_fp, " \"%s\"", tok.text); |
|||
fprintf(dest_fp, "\n"); |
|||
} while (tok.tok != EOI); |
|||
if (dest_fp != stdout) |
|||
fclose(dest_fp); |
|||
} |
|||
/*
 * Set *fp to the standard stream std when fn is the empty string, otherwise
 * to the file fn opened with mode.  Failure to open is a fatal error.
 */
void init_io(FILE **fp, FILE *std, const char mode[], const char fn[]) {
    if (fn[0] == '\0') {
        *fp = std;
        return;
    }
    *fp = fopen(fn, mode);
    if (*fp == NULL)
        error(0, 0, "Can't open %s\n", fn);
}
|||
int main(int argc, char *argv[]) { |
|||
init_io(&source_fp, stdin, "r", argc > 1 ? argv[1] : ""); |
|||
init_io(&dest_fp, stdout, "wb", argc > 2 ? argv[2] : ""); |
|||
run(); |
|||
} |
|||
</lang> |
|||
=={{header|Euphoria}}== |
|||
<lang euphoria> |
|||
include std/io.e |
|||
include std/map.e |
|||
include std/types.e |
|||
include std/convert.e |
|||
constant true = 1, false = 0, EOF = -1

-- token kinds; all_syms below lists the printable names in the same order
enum EOI, Printk, Putc, Ifk, Whilek, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div,
    Add, Sub, Lss, Gtr, Leq, Neq, Andk, Semi, Comma, Assign, Integerk, Stringk, Ident

constant all_syms = { "EOI", "Print", "Putc", "If", "While", "Lbrace", "Rbrace", "Lparen",
    "Rparen", "Uminus", "Mul", "Div", "Add", "Sub", "Lss", "Gtr", "Leq", "Neq", "And",
    "Semi", "Comma", "Assign", "Integer", "String", "Ident"}

-- scanner state: the_ch starts as a blank so the first get_tok() skips it
integer input_file, the_ch = ' ', the_col = 0, the_line = 1

sequence symbols        -- character code -> single-character token, filled by init()
map key_words = new()   -- keyword text -> token kind, filled by init()
|||
-- print a formatted message on STDOUT and stop with exit status 1
procedure error(sequence format, sequence data)
    printf(STDOUT, format, data)
    abort(1)
end procedure
|||
-- get the next character from the input; keeps the_line/the_col up to date
-- (a newline bumps the line and resets the column to 0)
function next_ch()
    the_ch = getc(input_file)
    the_col += 1
    if the_ch = '\n' then
        the_line += 1
        the_col = 0
    end if
    return the_ch
end function
|||
-- 'x' - character constants
-- the_ch is the opening quote on entry; returns {Integerk, line, col, code}.
-- Only the escapes \n and \\ are accepted.
function char_lit(integer err_line, integer err_col)
    integer n = next_ch()               -- skip opening quote
    if the_ch = '\'' then
        error("%d %d empty character constant", {err_line, err_col})
    elsif the_ch = '\\' then
        next_ch()
        if the_ch = 'n' then
            n = 10
        elsif the_ch = '\\' then
            n = '\\'
        else
            error("%d %d unknown escape sequence \\%c", {err_line, err_col, the_ch})
        end if
    end if
    if next_ch() != '\'' then
        error("%d %d multi-character constant", {err_line, err_col})
    end if
    next_ch()
    return {Integerk, err_line, err_col, n}
end function
|||
-- process divide or comments
-- the_ch is '/' on entry.  Returns a Div token, or skips the comment and
-- returns the token that follows it.
function div_or_cmt(integer err_line, integer err_col)
    if next_ch() != '*' then
        return {Div, err_line, err_col}
    end if

    -- comment found: step past the opening '*'
    next_ch()
    while true do
        if the_ch = EOF then
            error("%d %d EOF in comment", {err_line, err_col})
        elsif the_ch = '*' then
            -- advance one character only, so that a second '*' is
            -- re-tested and a "**/" terminator is not missed
            if next_ch() = '/' then
                next_ch()
                return get_tok()
            end if
        else
            next_ch()
        end if
    end while
end function
|||
-- "string"
-- the_ch is the opening quote (passed as start); returns
-- {Stringk, line, col, text} with the quotes stripped.
function string_lit(integer start, integer err_line, integer err_col)
    string text = ""

    while next_ch() != start do
        if the_ch = EOF then
            error("%d %d EOF while scanning string literal", {err_line, err_col})
        end if
        if the_ch = '\n' then
            error("%d %d EOL while scanning string literal", {err_line, err_col})
        end if
        text &= the_ch
    end while

    next_ch()       -- step past the closing quote
    return {Stringk, err_line, err_col, text}
end function
|||
-- handle identifiers and integers
-- Collects the run of letters, digits and '_' starting at the_ch.  A run
-- that starts with a digit must be all digits (Integerk); otherwise it is
-- a keyword or an Ident.
function ident_or_int(integer err_line, integer err_col)
    integer n, is_number = true
    string text = ""

    while t_alnum(the_ch) or the_ch = '_' do
        text &= the_ch
        if not t_digit(the_ch) then
            is_number = false
        end if
        next_ch()
    end while

    if length(text) = 0 then
        error("%d %d ident_or_int: unrecognized character: (%d) '%s'", {err_line, err_col, the_ch, the_ch})
    end if

    if t_digit(text[1]) then
        if not is_number then
            error("%d %d invalid number: %s", {err_line, err_col, text})
        end if
        n = to_integer(text)
        return {Integerk, err_line, err_col, n}
    end if

    if has(key_words, text) then
        return {get(key_words, text), err_line, err_col}
    end if

    return {Ident, err_line, err_col, text}
end function
|||
-- look ahead for '<=', '!=', '&&'
-- the_ch is the first character on entry.  Returns ifyes when the next
-- character is expect, else ifno; ifno = EOI means the first character is
-- not a token by itself, so a mismatch is an error.
function follow(integer expect, integer ifyes, integer ifno, integer err_line, integer err_col)
    if next_ch() = expect then
        next_ch()
        return {ifyes, err_line, err_col}
    end if

    if ifno = EOI then
        error("%d %d follow: unrecognized character: (%d)", {err_line, err_col, the_ch})
    end if

    return {ifno, err_line, err_col}
end function
|||
-- return the next token as {kind, line, col} or {kind, line, col, value}
function get_tok()
    while t_space(the_ch) do
        next_ch()
    end while

    integer err_line = the_line
    integer err_col  = the_col

    switch the_ch do
        case EOF  then return {EOI, err_line, err_col}
        case '/'  then return div_or_cmt(err_line, err_col)
        case '\'' then return char_lit(err_line, err_col)
        case '<'  then return follow('=', Leq,  Lss, err_line, err_col)
        case '!'  then return follow('=', Neq,  EOI, err_line, err_col)
        case '&'  then return follow('&', Andk, EOI, err_line, err_col)
        case '"'  then return string_lit(the_ch, err_line, err_col)
        case else
            -- guard the table lookup: a character code outside
            -- 1..length(symbols) (e.g. a NUL byte) must not be used as an index
            if the_ch >= 1 and the_ch <= length(symbols) then
                integer sym = symbols[the_ch]
                if sym != EOI then
                    next_ch()
                    return {sym, err_line, err_col}
                end if
            end if
            return ident_or_int(err_line, err_col)
    end switch
end function
|||
-- fill the keyword map and the single-character symbol table
procedure init()
    put(key_words, "if",    Ifk)
    put(key_words, "print", Printk)
    put(key_words, "putc",  Putc)
    put(key_words, "while", Whilek)

    -- EOI marks "this character is not a single-character token"
    symbols = repeat(EOI, 256)
    symbols['{'] = Lbrace
    symbols['}'] = Rbrace
    symbols['('] = Lparen
    symbols[')'] = Rparen
    symbols['+'] = Add
    symbols['-'] = Sub
    symbols['*'] = Mul
    symbols[';'] = Semi
    symbols[','] = Comma
    symbols['>'] = Gtr
    symbols['='] = Assign
end procedure
|||
-- main driver: cl is the command line; when a third element is present it
-- is used as the input file name, otherwise STDIN is read.
-- Prints one line per token until EOI has been printed.
procedure main(sequence cl)
    sequence file_name

    input_file = STDIN
    if length(cl) > 2 then
        file_name = cl[3]
        input_file = open(file_name, "r")
        if input_file = -1 then
            error("Could not open %s", {file_name})
        end if
    end if

    init()

    sequence t
    loop do
        t = get_tok()
        printf(STDOUT, "line %5d col %5d %-8s", {t[2], t[3], all_syms[t[1]]})
        switch t[1] do
            case Integerk then printf(STDOUT, " %5d\n", {t[4]})
            case Ident    then printf(STDOUT, " %s\n", {t[4]})
            case Stringk  then printf(STDOUT, " \"%s\"\n", {t[4]})
            case else          printf(STDOUT, "\n")
        end switch
        until t[1] = EOI
    end loop
end procedure

main(command_line())
|||
</lang> |
|||
=={{header|FreeBASIC}}== |
|||
<lang FreeBASIC> |
|||
' token kinds; scanner() fills a name table indexed by these values
enum Token_type
    tk_eoi
    tk_print
    tk_putc
    tk_if
    tk_while
    tk_lbrace
    tk_rbrace
    tk_lparen
    tk_rparen
    tk_uminus
    tk_mul
    tk_div
    tk_add
    tk_sub
    tk_lss
    tk_gtr
    tk_leq
    tk_neq
    tk_and
    tk_semi
    tk_comma
    tk_assign
    tk_integer
    tk_string
    tk_ident
end enum
|||
const NewLine     = chr(10)
const DoubleQuote = chr(34)

' where we store keywords and variables
type Symbol
    s_name as string
    tok    as Token_type
end type

dim shared symtab() as Symbol       ' grown by install()

' scanner state
dim shared cur_line as string       ' current source line, NewLine appended
dim shared cur_ch   as string       ' current character; "" means end-of-file
dim shared line_num as integer      ' line number of cur_line
dim shared col_num  as integer      ' position of cur_ch in cur_line
|||
' non-zero when ch is exactly one character in "0".."9"
' (the length test keeps a multi-character string such as "10" from
' passing the string range comparison)
function is_digit(byval ch as string) as long
    is_digit = (len(ch) = 1) and (ch >= "0") and (ch <= "9")
end function
|||
' non-zero when ch is a letter A-Z (either case) or a digit
function is_alnum(byval ch as string) as long
    dim up as string = UCase(ch)
    is_alnum = (ch <> "") and ((up >= "A" and up <= "Z") or (is_digit(ch)))
end function
|||
' print "(line:col) message" and end the program
sub error_msg(byval eline as integer, byval ecol as integer, byval msg as string)
    print "("; eline; ":"; ecol; ")"; " "; msg
    system
end sub
|||
' add an identifier to the symbol table; returns the index of the new entry
function install(byval s_name as string, byval tok as Token_type) as integer
    dim n as integer

    ' grow the table by one slot and fill the new last element
    n = ubound(symtab)
    redim preserve symtab(n + 1)
    n = ubound(symtab)

    symtab(n).s_name = s_name
    symtab(n).tok    = tok
    return n
end function
|||
' search for an identifier in the symbol table;
' returns its index, or -1 when it is not present
function lookup(byval s_name as string) as integer
    dim i as integer

    for i = lbound(symtab) to ubound(symtab)
        if symtab(i).s_name = s_name then return i
    next
    return -1
end function
|||
' read the next line of input from the source file (file #1);
' at end-of-file cur_line and cur_ch are left empty
sub next_line()
    cur_line = ""
    cur_ch   = ""       ' empty cur_ch means end-of-file
    if eof(1) then exit sub

    line input #1, cur_line
    cur_line = cur_line + NewLine   ' line input drops the terminator; put one back
    line_num += 1
    col_num = 1
end sub
|||
' get the next char into cur_ch, moving on to the next line when the
' current one is used up; cur_ch is "" at end-of-file
sub next_char()
    cur_ch = ""
    col_num += 1
    if col_num > len(cur_line) then next_line()
    if col_num <= len(cur_line) then cur_ch = mid(cur_line, col_num, 1)
end sub
|||
' Resolve a possibly two-character operator whose first character has been
' consumed.  Returns ifyes (and consumes cur_ch) when cur_ch = expect, else
' ifno.  ifno = tk_eoi means the first character is not a token by itself,
' so a mismatch is an error.
function follow(byval err_line as integer, byval err_col as integer, byval expect as string, byval ifyes as Token_type, byval ifno as Token_type) as Token_type
    if cur_ch = expect then
        next_char()
        return ifyes
    end if
    if ifno = tk_eoi then error_msg(err_line, err_col, "follow unrecognized character: " + cur_ch)
    return ifno
end function
|||
' Scan the next token starting at cur_ch.
' All results come back through the byref parameters:
'   err_line, err_col - where the token starts
'   tok               - the token type
'   v                 - the token text; set only for integers, character
'                       literals, strings (quotes included) and identifiers
sub gettok(byref err_line as integer, byref err_col as integer, byref tok as Token_type, byref v as string)
    ' skip whitespace
    do while cur_ch = " " or cur_ch = chr(9) or cur_ch = NewLine
        next_char()
    loop

    err_line = line_num
    err_col  = col_num

    select case cur_ch
        case "":  tok = tk_eoi: exit sub

        case "{": tok = tk_lbrace: next_char(): exit sub
        case "}": tok = tk_rbrace: next_char(): exit sub
        case "(": tok = tk_lparen: next_char(): exit sub
        case ")": tok = tk_rparen: next_char(): exit sub
        case "+": tok = tk_add:    next_char(): exit sub
        case "-": tok = tk_sub:    next_char(): exit sub
        case "*": tok = tk_mul:    next_char(): exit sub
        case ";": tok = tk_semi:   next_char(): exit sub
        case ",": tok = tk_comma:  next_char(): exit sub
        case ">": tok = tk_gtr:    next_char(): exit sub
        case "=": tok = tk_assign: next_char(): exit sub

        case "/":   ' div or comment
            next_char()
            if cur_ch <> "*" then
                tok = tk_div
                exit sub
            end if
            ' skip the comment: step past the opening "*" first
            next_char()
            do
                ' end-of-file inside a comment is an error, not a terminator
                if cur_ch = "" then error_msg(err_line, err_col, "EOF in comment")
                if cur_ch = "*" then
                    ' advance one character only, so that a second "*" is
                    ' re-tested and a "**/" terminator is not missed
                    next_char()
                    if cur_ch = "/" then
                        next_char()
                        gettok(err_line, err_col, tok, v)
                        exit sub
                    end if
                else
                    next_char()
                end if
            loop

        case "'":   ' single char literals
            next_char()
            v = str(Asc(cur_ch))
            if cur_ch = "'" then error_msg(err_line, err_col, "empty character constant")
            if cur_ch = "\" then
                next_char()
                if cur_ch = "n" then
                    v = "10"
                elseif cur_ch = "\" then
                    v = Str(Asc("\"))
                else
                    error_msg(err_line, err_col, "unknown escape sequence: " + cur_ch)
                end if
            end if
            next_char()
            if cur_ch <> "'" then error_msg(err_line, err_col, "multi-character constant")
            next_char()
            tok = tk_integer
            exit sub

        case "<": next_char(): tok = follow(err_line, err_col, "=", tk_Leq, tk_Lss): exit sub
        case "!": next_char(): tok = follow(err_line, err_col, "=", tk_Neq, tk_EOI): exit sub
        case "&": next_char(): tok = follow(err_line, err_col, "&", tk_And, tk_EOI): exit sub

        case DoubleQuote:   ' string
            v = cur_ch
            next_char()
            do while cur_ch <> DoubleQuote
                if cur_ch = NewLine then error_msg(err_line, err_col, "EOL in string")
                if cur_ch = "" then error_msg(err_line, err_col, "EOF in string")
                v += cur_ch
                next_char()
            loop
            v += cur_ch     ' keep the closing quote as well
            next_char()
            tok = tk_string
            exit sub

        case else   ' integers or identifiers
            dim is_number as boolean = is_digit(cur_ch)
            v = ""
            do while is_alnum(cur_ch) orelse cur_ch = "_"
                if not is_digit(cur_ch) then is_number = false
                v += cur_ch
                next_char()
            loop
            if len(v) = 0 then error_msg(err_line, err_col, "unknown character: " + cur_ch)

            if is_digit(mid(v, 1, 1)) then
                if not is_number then error_msg(err_line, err_col, "invalid number: " + v)
                tok = tk_integer
                exit sub
            end if

            ' keywords live in the symbol table; anything else is an identifier
            dim as integer index = lookup(v)
            if index = -1 then
                tok = tk_ident
            else
                tok = symtab(index).tok
            end if
            exit sub
    end select
end sub
|||
' register the keywords, open the source file as #1 and load the first character
sub init_lex(byval filein as string)
    install("if",    tk_if)
    install("print", tk_print)
    install("putc",  tk_putc)
    install("while", tk_while)

    ' the function form of open returns non-zero on failure
    if open(filein for input as #1) <> 0 then
        error_msg(0, 0, "Can't open " + filein)
    end if

    cur_line = ""
    line_num = 0
    col_num  = 0
    next_char()
end sub
|||
' print every token of the input, one per line, until tk_eoi has been printed
sub scanner()
    dim err_line as integer
    dim err_col  as integer
    dim tok      as Token_type
    dim v        as string

    ' printable token names, indexed by Token_type
    dim tok_list(tk_eoi to tk_ident) as string
    tok_list(tk_eoi    ) = "EOI"
    tok_list(tk_print  ) = "Print"
    tok_list(tk_putc   ) = "Putc"
    tok_list(tk_if     ) = "If"
    tok_list(tk_while  ) = "While"
    tok_list(tk_lbrace ) = "Lbrace"
    tok_list(tk_rbrace ) = "Rbrace"
    tok_list(tk_lparen ) = "Lparen"
    tok_list(tk_rparen ) = "Rparen"
    tok_list(tk_uminus ) = "Uminus"
    tok_list(tk_mul    ) = "Mul"
    tok_list(tk_div    ) = "Div"
    tok_list(tk_add    ) = "Add"
    tok_list(tk_sub    ) = "Sub"
    tok_list(tk_lss    ) = "Lss"
    tok_list(tk_gtr    ) = "Gtr"
    tok_list(tk_leq    ) = "Leq"
    tok_list(tk_neq    ) = "Neq"
    tok_list(tk_and    ) = "And"
    tok_list(tk_semi   ) = "Semi"
    tok_list(tk_comma  ) = "Comma"
    tok_list(tk_assign ) = "Assign"
    tok_list(tk_integer) = "Integer"
    tok_list(tk_string ) = "String"
    tok_list(tk_ident  ) = "Ident"

    do
        gettok(err_line, err_col, tok, v)
        ' the \      \ field is 8 characters wide, enough for the longest name
        print using "line ##### col ##### \      \"; err_line; err_col; tok_list(tok);
        if tok = tk_integer orelse tok = tk_ident orelse tok = tk_string then print " " + v;
        print
    loop until tok = tk_eoi
end sub
|||
' entry point: the first command-line argument is the source file name
sub main()
    if command(1) = "" then print "filename required" : system
    init_lex(command(1))
    scanner()
end sub

main()
system
|||
</lang> |
|||
=={{header|Python}}== |
|||
<lang Python> |
|||
from __future__ import print_function |
|||
import sys |
|||
# following two must remain in the same order
(EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add,
 Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident) = range(25)

all_syms = [
    'EOI', 'Print', 'Putc', 'If', 'While', 'Lbrace', 'Rbrace', 'Lparen',
    'Rparen', 'Uminus', 'Mul', 'Div', 'Add', 'Sub', 'Lss', 'Gtr', 'Leq', 'Neq', 'And',
    'Semi', 'Comma', 'Assign', 'Integer', 'String', 'Ident',
]

# single character only symbols
symbols = {
    '{': Lbrace, '}': Rbrace, '(': Lparen, ')': Rparen, '+': Add, '-': Sub,
    '*': Mul, ';': Semi, ',': Comma, '>': Gtr, '=': Assign,
}

key_words = {'if': If, 'print': Print, 'putc': Putc, 'while': While}

# scanner state, updated by next_ch()
the_ch = " "        # dummy first char - but it must be a space
the_col = 0
the_line = 1
input_file = None
|||
#*** show error and exit
def error(line, col, msg):
    """Print ``line col msg`` on stdout and exit with status 1."""
    print(line, col, msg)
    # sys.exit is always available; the bare exit() builtin is injected by
    # the site module and is missing under ``python -S`` or when frozen.
    sys.exit(1)
|||
#*** get the next character from the input
def next_ch():
    """Read one character into ``the_ch`` and return it ('' at end of input).

    Keeps ``the_line``/``the_col`` in step: a newline bumps the line
    counter and resets the column to 0.
    """
    global the_ch, the_col, the_line

    the_ch = input_file.read(1)
    the_col += 1
    if the_ch == '\n':
        the_line += 1
        the_col = 0
    return the_ch
|||
#*** 'x' - character constants
def char_lit(err_line, err_col):
    """Scan a character literal; ``the_ch`` is the opening quote on entry.

    Returns ``(Integerk, err_line, err_col, code)`` where ``code`` is the
    integer character code.  Only the escapes ``\\n`` and ``\\\\`` are
    accepted; every malformed literal is reported through ``error``.
    """
    next_ch()                               # skip opening quote
    if len(the_ch) == 0:
        # ord('') would raise a TypeError, so report EOF explicitly
        error(err_line, err_col, "EOF while scanning character constant")
    if the_ch == '\'':
        error(err_line, err_col, "empty character constant")

    n = ord(the_ch)
    if the_ch == '\\':
        next_ch()
        if the_ch == 'n':
            n = 10
        elif the_ch == '\\':
            n = ord('\\')                   # the code, not the one-char string
        else:
            # %s rather than %c: the_ch is '' when the input ends here
            error(err_line, err_col, "unknown escape sequence \\%s" % the_ch)

    if next_ch() != '\'':
        error(err_line, err_col, "multi-character constant")
    next_ch()
    return Integerk, err_line, err_col, n
|||
#*** process divide or comments
def div_or_cmt(err_line, err_col):
    """Handle a '/': return a Div token, or skip a comment.

    ``the_ch`` is the '/' on entry.  After a comment the token that follows
    it is returned.  End of input inside a comment is an error.
    """
    if next_ch() != '*':
        return Div, err_line, err_col

    # comment found: step past the opening '*' so "/*/" is not taken as closed
    next_ch()
    while True:
        if len(the_ch) == 0:
            error(err_line, err_col, "EOF in comment")
        if the_ch == '*':
            # Advance one character only: if it is another '*' the next
            # iteration re-tests it, so a "**/" terminator is recognised.
            if next_ch() == '/':
                next_ch()
                return gettok()
        else:
            next_ch()
|||
#*** "string"
def string_lit(start, err_line, err_col):
    """Scan a string literal delimited by ``start`` (the opening quote).

    Returns ``(Stringk, err_line, err_col, text)`` with the quotes
    stripped.  A newline or end of input before the closing quote is an
    error.
    """
    chars = []
    while next_ch() != start:
        if len(the_ch) == 0:
            error(err_line, err_col, "EOF while scanning string literal")
        if the_ch == '\n':
            error(err_line, err_col, "EOL while scanning string literal")
        chars.append(the_ch)

    next_ch()                               # step past the closing quote
    return Stringk, err_line, err_col, "".join(chars)
|||
#*** handle identifiers and integers
# Sets rather than strings: '' (end of input) must not count as a member.
_DIGITS = frozenset("0123456789")
_IDENT_CHARS = frozenset(
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_")


def ident_or_int(err_line, err_col):
    """Scan an identifier, keyword or integer starting at ``the_ch``.

    Only the ASCII characters of the language's identifier and integer
    rules are accepted.  ``str.isalnum``/``str.isdigit`` also accept other
    Unicode letters and digits (for example a superscript two), which
    ``int()`` then rejects with a ValueError.

    Returns ``(Integerk, line, col, value)``, ``(keyword, line, col)`` or
    ``(Ident, line, col, text)``.
    """
    is_number = True
    chars = []
    while the_ch in _IDENT_CHARS:
        chars.append(the_ch)
        if the_ch not in _DIGITS:
            is_number = False
        next_ch()
    text = "".join(chars)

    if len(text) == 0:
        error(err_line, err_col,
              "ident_or_int: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))

    if text[0] in _DIGITS:
        if not is_number:
            error(err_line, err_col, "invalid number: %s" % (text))
        return Integerk, err_line, err_col, int(text)

    if text in key_words:
        return key_words[text], err_line, err_col

    return Ident, err_line, err_col, text
|||
#*** look ahead for '<=', '!=', '&&'
def follow(expect, ifyes, ifno, err_line, err_col):
    """Resolve a possibly two-character operator.

    ``the_ch`` is the first character on entry.  Returns the ``ifyes``
    token when the next character equals ``expect``, otherwise ``ifno``.
    ``ifno == EOI`` means the first character is not a token by itself, so
    a mismatch is an error.
    """
    if next_ch() == expect:
        next_ch()
        return ifyes, err_line, err_col

    if ifno == EOI:
        if len(the_ch) == 0:
            # ord('') would raise a TypeError; report the real problem
            error(err_line, err_col, "follow: unexpected end of input")
        error(err_line, err_col,
              "follow: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))

    return ifno, err_line, err_col
|||
#*** return the next token type
def gettok():
    """Return the next token as ``(kind, line, col)`` or ``(kind, line, col, value)``.

    Leading whitespace is skipped; line and col are where the token starts.
    """
    while the_ch.isspace():
        next_ch()

    err_line = the_line
    err_col = the_col

    if len(the_ch) == 0:
        return EOI, err_line, err_col
    if the_ch == '/':
        return div_or_cmt(err_line, err_col)
    if the_ch == '\'':
        return char_lit(err_line, err_col)
    if the_ch == '<':
        return follow('=', Leq, Lss, err_line, err_col)
    if the_ch == '!':
        return follow('=', Neq, EOI, err_line, err_col)
    if the_ch == '&':
        return follow('&', And, EOI, err_line, err_col)
    if the_ch == '"':
        return string_lit(the_ch, err_line, err_col)
    if the_ch in symbols:
        sym = symbols[the_ch]
        next_ch()
        return sym, err_line, err_col
    return ident_or_int(err_line, err_col)
|||
#*** main driver
def main():
    """Tokenize the file named by the first argument (default: stdin).

    Prints one line per token on stdout and stops after the EOI token.
    """
    global input_file

    input_file = sys.stdin
    if len(sys.argv) > 1:
        try:
            input_file = open(sys.argv[1], "r", 4096)
        except IOError:
            error(0, 0, "Can't open %s" % sys.argv[1])

    try:
        while True:
            t = gettok()
            tok = t[0]
            line = t[1]
            col = t[2]

            print("line %5d col %5d %-8s" % (line, col, all_syms[tok]), end='')

            if tok == Integerk:
                print(" %5d" % (t[3]))
            elif tok == Ident:
                print(" %s" % (t[3]))
            elif tok == Stringk:
                print(' "%s"' % (t[3]))
            else:
                print("")

            if tok == EOI:
                break
    finally:
        # close a file we opened ourselves, also when error() exits early
        if input_file is not sys.stdin:
            input_file.close()


if __name__ == "__main__":
    main()
|||
</lang> |
Latest revision as of 03:37, 14 August 2016
Hello, World!