Ed Davis

Joined 31 August 2022
m
Replaced content with "Hello, World!"
No edit summary
m (Replaced content with "Hello, World!")
 
(6 intermediate revisions by the same user not shown)
Line 1:
Hello, World!
Lexical analysis is the process of converting a sequence of characters (such as in a
computer program or web page) into a sequence of tokens (strings with an identified
"meaning"). A program that performs lexical analysis may be called a lexer, tokenizer,
or scanner (though "scanner" is also used to refer to the first stage of a lexer).
 
;The Task
 
Create a lexical analyzer for the Tiny programming language. The
program should read input from a file and/or stdin, and write
output to a file and/or stdout.
 
;Specification
 
The various token types are denoted below.
 
;Operators
 
{| class="wikitable"
|-
! Characters !! Common name !! Name
|-
| * || multiply || Mul
|-
| / || divide || Div
|-
| + || plus || Add
|-
| - || minus and unary minus || Sub and Uminus
|-
| < || less than || Lss
|-
| <= || less than or equal || Leq
|-
| > || greater than || Gtr
|-
| != || not equal || Neq
|-
| = || assign || Assign
|-
| && || and || And
|}
 
;Symbols
 
{| class="wikitable"
|-
! Characters !! Common name !! Name
|-
| ( || left parenthesis || Lparen
|-
| ) || right parenthesis || Rparen
|-
| { || left brace || Lbrace
|-
| } || right brace || Rbrace
|-
| ; || semi colon || Semi
|-
| , || comma || Comma
|}
 
;Keywords
 
{| class="wikitable"
|-
! Characters !! Name
|-
| if || If
|-
| while || While
|-
| print || Print
|-
| putc || Putc
|}
 
;Other entities
 
{| class="wikitable"
|-
! Characters !! Regular expression !! Name
|-
| integers || [0-9]+ || Integer
|-
| char literal || 'x' || Integer
|-
| identifiers || [_a-zA-Z][_a-zA-Z0-9]+ || Ident
|-
| string literal || ".*" || String
|}
 
Notes: For char literals, '\n' is supported as a new line
character. To represent \, use: '\\'. \n may also be used in
Strings, to print a newline. No other special sequences are
supported.
 
'''Comments''' /* ... */ (multi-line)
 
;Complete list of token names
 
'''EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add, Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident'''
 
;Program output
 
Output of the program should be:
 
* the word line, followed by:
* the line number where the token starts, followed by:
* the abbreviation col, followed by:
* the column number where the token starts, followed by:
* the token name.
* If the token name is one of Integer, Ident or String, the actual value of the same should follow.
 
;Test Cases
 
<lang c>
/*
Hello world
*/
print("Hello, World!\n");
</lang>
 
;Output
 
{| class="wikitable"
|-
| line || 4 || col || 1 || Print || &nbsp;
|-
| line || 4 || col || 6 || Lparen || &nbsp;
|-
| line || 4 || col || 7 || String || "Hello, World!\n"
|-
| line || 4 || col || 24 || Rparen || &nbsp;
|-
| line || 4 || col || 25 || Semi || &nbsp;
|-
| line || 5 || col || 1 || EOI || &nbsp;
|}
 
<lang c>
/*
Show Ident and Integers
*/
phoenix_number = 142857;
print(phoenix_number, "\n");
</lang>
 
;Output
 
{| class="wikitable"
|-
| line || 1 || col || 1 || Ident || phoenix_number
|-
| line || 1 || col || 16 || Assign || &nbsp;
|-
| line || 1 || col || 18 || Integer || 142857
|-
| line || 1 || col || 24 || Semi || &nbsp;
|-
| line || 2 || col || 1 || Print || &nbsp;
|-
| line || 2 || col || 6 || Lparen || &nbsp;
|-
| line || 2 || col || 7 || Ident || phoenix_number
|-
| line || 2 || col || 21 || Comma || &nbsp;
|-
| line || 2 || col || 23 || String || "\n"
|-
| line || 2 || col || 27 || Rparen || &nbsp;
|-
| line || 2 || col || 28 || Semi || &nbsp;
|-
| line || 3 || col || 1 || EOI || &nbsp;
|}
 
;Diagnostics
The following error conditions should be caught:
 
* Empty character constant. Example: &apos;&apos;
* Unknown escape sequence. Example: '\r'
* Multi-character constant. Example: 'xx'
* End-of-file in comment. Closing comment characters not found.
* End-of-file while scanning string literal. Closing string character not found.
* End-of-line while scanning string literal. Closing string character not found before end-of-line.
* Unrecognized character. Example: |
 
;Reference
 
The C and Python versions can be considered reference implementations.
 
;Implementations
 
__TOC__
 
=={{header|C}}==
<lang C>
 
#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
#include <stdbool.h>
 
#define NELEMS(arr) (sizeof(arr) / sizeof(arr[0]))
 
#define da_dim(name, type) type *name = NULL; \
int _qy_ ## name ## _p = 0; \
int _qy_ ## name ## _max = 0
#define da_rewind(name) _qy_ ## name ## _p = 0
#define da_redim(name) if (_qy_ ## name ## _p >= _qy_ ## name ## _max) \
name = realloc(name, (_qy_ ## name ## _max += 32) * sizeof(name[0]))
#define da_append(name, x) do {da_redim(name); name[_qy_ ## name ## _p++] = x;} while (0)
#define da_len(name) _qy_ ## name ## _p
 
// dependancy: atr table in parse.c ordering is based on these
typedef enum {
EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add,
Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident
} TokenType;
 
typedef struct {
int tok;
int err_ln, err_col;
union {
int n; /* value for constants */
char *text; /* text for idents */
};
} tok_s;
 
static FILE *source_fp, *dest_fp;
static int line, col, the_ch;
da_dim(text, char);
 
tok_s gettok();
 
static void error(int err_line, int err_col, const char *fmt, ... ) {
char buf[1000];
va_list ap;
 
va_start(ap, fmt);
vsprintf(buf, fmt, ap);
va_end(ap);
printf("(%d,%d) error: %s\n", err_line, err_col, buf);
exit(1);
}
 
static void read_ch() { /* get next char from input */
the_ch = getc(source_fp);
++col;
if (the_ch == '\n') {
++line;
col = 0;
}
}
 
static tok_s char_lit(int n, int err_line, int err_col) { /* 'x' */
if (the_ch == '\'')
error(err_line, err_col, "gettok: empty character constant");
if (the_ch == '\\') {
read_ch();
if (the_ch == 'n')
n = 10;
else if (the_ch == '\\')
n = '\\';
else error(err_line, err_col, "gettok: unknown escape sequence \\%c", the_ch);
}
read_ch();
if (the_ch != '\'') error(err_line, err_col, "multi-character constant");
read_ch();
return (tok_s){Integerk, err_line, err_col, {n}};
}
 
static tok_s div_or_cmt(int err_line, int err_col) { /* process divide or comments */
if (the_ch != '*')
return (tok_s){Div, err_line, err_col, {0}};
 
/* comment found */
for (;;) {
read_ch();
if (the_ch == '*' || the_ch == EOF) {
read_ch();
if (the_ch == '/' || the_ch == EOF) {
read_ch();
return gettok();
}
}
}
}
 
static tok_s string_lit(int start, int err_line, int err_col) { /* "st" */
da_rewind(text);
 
for (read_ch(); the_ch != start; read_ch()) {
if (the_ch == '\n')
error(err_line, err_col, "EOL in string");
if (the_ch == EOF)
error(err_line, err_col, "EOF in string");
da_append(text, (char)the_ch);
}
da_append(text, '\0');
 
read_ch();
return (tok_s){Stringk, err_line, err_col, {.text=text}};
}
 
static int kwd_cmp(const void *p1, const void *p2) {
return stricmp(*(char **)p1, *(char **)p2);
}
 
static TokenType get_ident_type(const char *ident) {
static struct {
char *s;
TokenType sym;
} kwds[] = {
{"if", If},
{"print", Print},
{"putc", Putc},
{"while", While},
}, *kwp;
 
return (kwp = bsearch(&ident, kwds, NELEMS(kwds), sizeof(kwds[0]), kwd_cmp)) == NULL ? Ident : kwp->sym;
}
 
static tok_s ident_or_int(int err_line, int err_col) {
int n, is_number = true;
 
da_rewind(text);
while (isalnum(the_ch) || the_ch == '_') {
da_append(text, (char)the_ch);
if (!isdigit(the_ch))
is_number = false;
read_ch();
}
if (da_len(text) == 0)
error(err_line, err_col, "gettok: unrecognized character (%d) '%c'\n", the_ch, the_ch);
da_append(text, '\0');
if (isdigit(text[0])) {
if (!is_number)
error(err_line, err_col, "invalid number: %s\n", text);
n = strtol(text, NULL, 0);
if (n == LONG_MAX && errno == ERANGE)
error(err_line, err_col, "Number exceeds maximum value");
return (tok_s){Integerk, err_line, err_col, {n}};
}
return (tok_s){get_ident_type(text), err_line, err_col, {.text=text}};
}
 
static tok_s follow(int expect, TokenType ifyes, TokenType ifno, int err_line, int err_col) { /* look ahead for '>=', etc. */
if (the_ch == expect) {
read_ch();
return (tok_s){ifyes, err_line, err_col, {0}};
}
if (ifno == EOI) error(err_line, err_col, "follow: unrecognized character '%c' (%d)\n", the_ch, the_ch);
return (tok_s){ifno, err_line, err_col, {0}};
}
 
tok_s gettok() { /* return the token type */
/* skip white space */
while (isspace(the_ch))
read_ch();
int err_line = line;
int err_col = col;
switch (the_ch) {
case '{': read_ch(); return (tok_s){Lbrace, err_line, err_col, {0}};
case '}': read_ch(); return (tok_s){Rbrace, err_line, err_col, {0}};
case '(': read_ch(); return (tok_s){Lparen, err_line, err_col, {0}};
case ')': read_ch(); return (tok_s){Rparen, err_line, err_col, {0}};
case '+': read_ch(); return (tok_s){Add, err_line, err_col, {0}};
case '-': read_ch(); return (tok_s){Sub, err_line, err_col, {0}};
case '*': read_ch(); return (tok_s){Mul, err_line, err_col, {0}};
case ';': read_ch(); return (tok_s){Semi, err_line, err_col, {0}};
case ',': read_ch(); return (tok_s){Comma, err_line, err_col, {0}};
case '>': read_ch(); return (tok_s){Gtr, err_line, err_col, {0}};
case '=': read_ch(); return (tok_s){Assign, err_line, err_col, {0}};
case '/': read_ch(); return div_or_cmt(err_line, err_col);
case '\'': read_ch(); return char_lit(the_ch, err_line, err_col);
case '<': read_ch(); return follow('=', Leq, Lss, err_line, err_col);
case '!': read_ch(); return follow('=', Neq, EOI, err_line, err_col);
case '&': read_ch(); return follow('&', And, EOI, err_line, err_col);
case '"' : return string_lit(the_ch, err_line, err_col);
default: return ident_or_int(err_line, err_col);
case EOF: return (tok_s){EOI, err_line, err_col, {0}};
}
}
 
void init_lex() { /* initialize the scanner */
line = 1;
read_ch();
}
 
void run() { /* tokenize the given input */
tok_s tok;
do {
tok = gettok();
fprintf(dest_fp, "line %5d col %5d %.8s",
tok.err_ln, tok.err_col,
&"EOI Print Putc If While Lbrace Rbrace Lparen Rparen "
"Uminus Mul Div Add Sub Lss Gtr Leq Neq "
"And Semi Comma Assign Integer String Ident "[tok.tok * 9]);
if (tok.tok == Integerk)
fprintf(dest_fp, " %8d", tok.n);
else if (tok.tok == Ident)
fprintf(dest_fp, " %s", tok.text);
else if (tok.tok == Stringk)
fprintf(dest_fp, " \"%s\"", tok.text);
fprintf(dest_fp, "\n");
} while (tok.tok != EOI);
if (dest_fp != stdout)
fclose(dest_fp);
}
 
void init_io(FILE **fp, FILE *std, const char mode[], const char fn[]) {
if (fn[0] == '\0')
*fp = std;
else if ((*fp = fopen(fn, mode)) == NULL)
error(0, 0, "Can't open %s\n", fn);
}
 
int main(int argc, char *argv[]) {
init_io(&source_fp, stdin, "r", argc > 1 ? argv[1] : "");
init_io(&dest_fp, stdout, "wb", argc > 2 ? argv[2] : "");
init_lex();
run();
}
</lang>
 
=={{header|Python}}==
<lang Python>
 
import sys
 
# following two must remain in the same order
EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add, \
Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident = range(25)
 
all_syms = [ 'EOI', 'Print', 'Putc', 'If', 'While', 'Lbrace', 'Rbrace', 'Lparen',
'Rparen', 'Uminus', 'Mul', 'Div', 'Add', 'Sub', 'Lss', 'Gtr', 'Leq', 'Neq', 'And',
'Semi', 'Comma', 'Assign', 'Integer', 'String', 'Ident' ]
 
# single character only symbols
symbols = { '{': Lbrace, '}': Rbrace, '(': Lparen, ')': Rparen, '+': Add, '-': Sub,
'*': Mul, ';': Semi, ',': Comma, '>': Gtr, '=': Assign }
 
key_words = { 'if': If, 'print': Print, 'putc': Putc, 'while': While }
 
the_ch = " " # dummy first char - but it must be a space
the_col = 0
the_line = 1
input_file = None
 
#*** show error and exit
def error(line, col, msg):
print line, col, msg
exit(1)
 
#*** get the next character from the input
def getc():
global the_ch, the_col, the_line
 
the_ch = input_file.read(1)
the_col += 1
if the_ch == '\n':
the_line += 1
the_col = 0
return the_ch
 
#*** 'x' - character constants
def char_lit(err_line, err_col):
n = ord(getc()) # skip opening quote
if the_ch == '\'':
error(err_line, err_col, "empty character constant")
elif the_ch == '\\':
getc()
if the_ch == 'n':
n = 10
elif the_ch == '\\':
n = '\\'
else:
error(err_line, err_col, "unknown escape sequence \\%c" % (the_ch))
if getc() != '\'':
error(err_line, err_col, "multi-character constant")
getc()
return Integerk, err_line, err_col, n
 
#*** process divide or comments
def div_or_cmt(err_line, err_col):
if getc() != '*':
return Div, err_line, err_col
 
# comment found
while True:
if getc() == '*' and getc() == '/':
getc()
return gettok()
elif len(the_ch) == 0:
error(err_line, err_col, "EOF in comment")
 
#*** "string"
def string_lit(start, err_line, err_col):
text = ""
 
while getc() != start:
if len(the_ch) == 0:
error(err_line, err_col, "EOF while scanning string literal")
if the_ch == '\n':
error(err_line, err_col, "EOL while scanning string literal")
text += the_ch
 
getc()
return Stringk, err_line, err_col, text
 
#*** handle identifiers and integers
def ident_or_int(err_line, err_col):
is_number = True
text = ""
 
while the_ch.isalnum() or the_ch == '_':
text += the_ch
if not the_ch.isdigit():
is_number = False
getc()
 
if len(text) == 0:
error(err_line, err_col, "ident_or_int: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))
 
if text[0].isdigit():
if not is_number:
error(err_line, err_col, "invalid number: %s" % (text))
n = int(text)
return Integerk, err_line, err_col, n
 
if text in key_words:
return key_words[text], err_line, err_col
 
return Ident, err_line, err_col, text
 
#*** look ahead for '>=', etc.
def follow(expect, ifyes, ifno, err_line, err_col):
if getc() == expect:
getc()
return ifyes, err_line, err_col
 
if ifno == EOI:
error(err_line, err_col, "follow: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))
 
return ifno, err_line, err_col
 
#*** return the next token type
def gettok():
while the_ch.isspace():
getc()
 
err_line = the_line
err_col = the_col
 
if len(the_ch) == 0: return EOI, err_line, err_col
elif the_ch in symbols: sym = symbols[the_ch]; getc(); return sym, err_line, err_col
elif the_ch == '/': return div_or_cmt(err_line, err_col)
elif the_ch == '\'': return char_lit(err_line, err_col)
elif the_ch == '<': return follow('=', Leq, Lss, err_line, err_col)
elif the_ch == '!': return follow('=', Neq, EOI, err_line, err_col)
elif the_ch == '&': return follow('&', And, EOI, err_line, err_col)
elif the_ch == '"': return string_lit(the_ch, err_line, err_col)
else: return ident_or_int(err_line, err_col)
 
#*** main driver
input_file = sys.stdin
if len(sys.argv) > 1:
try:
input_file = open(sys.argv[1], "r", 4096)
except IOError as e:
error(0, 0, "Can't open %s" % sys.argv[1])
 
while True:
t = gettok()
tok = t[0]
line = t[1]
col = t[2]
 
if tok == Integerk:
print "line %5d col %5d %-8s %8d" % (line, col, all_syms[tok], t[3])
elif tok == Ident:
print "line %5d col %5d %-8s %s" % (line, col, all_syms[tok], t[3])
elif tok == Stringk:
print 'line %5d col %5d %-8s "%s"' % (line, col, all_syms[tok], t[3])
else:
print "line %5d col %5d %-8s" % (line, col, all_syms[tok])
 
if tok == EOI:
break
</lang>
155

edits