User:Ed Davis: Difference between revisions

From Rosetta Code
Content added Content deleted
No edit summary
(Replaced content with "Hello")
Line 1: Line 1:
Hello
{{task}}Lexical Analyzer

From [https://en.wikipedia.org/wiki/Lexical_analysis Wikipedia]

Lexical analysis is the process of converting a sequence of characters (such as in a
computer program or web page) into a sequence of tokens (strings with an identified
"meaning"). A program that performs lexical analysis may be called a lexer, tokenizer,
or scanner (though "scanner" is also used to refer to the first stage of a lexer).

;The Task

Create a lexical analyzer for the Tiny programming language. The
program should read input from a file and/or stdin, and write
output to a file and/or stdout.

;Specification

The various token types are denoted below.

;Operators

{| class="wikitable"
|-
! Characters !! Common name !! Name
|-
| '*' || multiply || Mul
|-
| '/' || divide || Div
|-
| '+' || plus || Add
|-
| '-' || minus and unary minus || Sub and Uminus
|-
| '<' || less than || Lss
|-
| '<=' || less than or equal || Leq
|-
| '>' || greater than || Gtr
|-
| '!=' || not equal || Neq
|-
| '=' || assign || Assign
|-
| '&&' || and || And
|}

;Symbols

{| class="wikitable"
|-
! Characters !! Common name !! Name
|-
| '(' || left parenthesis || Lparen
|-
| ')' || right parenthesis || Rparen
|-
| '{' || left brace || Lbrace
|-
| '}' || right brace || Rbrace
|-
| ';' || semi colon || Semi
|-
| ',' || comma || Comma
|}

;Keywords

{| class="wikitable"
|-
! Characters !! Name
|-
| "if" || If
|-
| "while" || While
|-
| "print" || Print
|-
| "putc" || Putc
|}

;Other entities

{| class="wikitable"
|-
! Characters !! Regular expression !! Name
|-
| integers || [0-9]+ || Integer
|-
| char literal || 'x' || Integer
|-
| identifiers || [_a-zA-Z][_a-zA-Z0-9]+ || Ident
|-
| string literal || ".*" || String
|}

Notes: For char literals, '\n' is supported as a new line
character. To represent \, use: '\\'. \n may also be used in
Strings, to print a newline. No other special sequences are
supported.

'''Comments''' /* ... */ (multi-line)

;Complete list of token names

'''EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add, Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident'''

;Program output

Output of the program should be the line and column where the
found token starts, followed by the Token name. For tokens
Integer, Ident and String, the Integer, identifier, or string
should follow.

;Test Cases

<lang c>
/*
Hello world
*/
print("Hello, World!\n");
</lang>

;Output

{| class="wikitable"
|-
| line || 4 || col || 1 || Print || &nbsp;
|-
| line || 4 || col || 6 || Lparen || &nbsp;
|-
| line || 4 || col || 7 || String || "Hello, World!\n"
|-
| line || 4 || col || 24 || Rparen || &nbsp;
|-
| line || 4 || col || 25 || Semi || &nbsp;
|-
| line || 5 || col || 1 || EOI || &nbsp;
|}

<lang c>
/*
Show Ident and Integers
*/
phoenix_number = 142857;
print(phoenix_number, "\n");
</lang>

;Output

{| class="wikitable"
|-
| line || 1 || col || 1 || Ident || phoenix_number
|-
| line || 1 || col || 16 || Assign || &nbsp;
|-
| line || 1 || col || 18 || Integer || 142857
|-
| line || 1 || col || 24 || Semi || &nbsp;
|-
| line || 2 || col || 1 || Print || &nbsp;
|-
| line || 2 || col || 6 || Lparen || &nbsp;
|-
| line || 2 || col || 7 || Ident || phoenix_number
|-
| line || 2 || col || 21 || Comma || &nbsp;
|-
| line || 2 || col || 23 || String || "\n"
|-
| line || 2 || col || 27 || Rparen || &nbsp;
|-
| line || 2 || col || 28 || Semi || &nbsp;
|-
| line || 3 || col || 1 || EOI || &nbsp;
|}

;Diagnostics
The following error conditions should be caught:

* Empty character constant. Example: ''
* Unknown escape sequence. Example: '\r'
* Multi-character constant. Example: 'xx'
* End-of-file in comment. Closing comment characters not found.
* End-of-file while scanning string literal. Closing string character not found.
* End-of-line while scanning string literal. Closing string character not found before end-of-line.
* Unrecognized character. Example: |

Refer additional questions to the C and Python implementations.

=={{header|C}}==
<lang C>

#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
#include <stdbool.h>

#define NELEMS(arr) (sizeof(arr) / sizeof(arr[0]))

#define da_dim(name, type) type *name = NULL; \
int _qy_ ## name ## _p = 0; \
int _qy_ ## name ## _max = 0
#define da_rewind(name) _qy_ ## name ## _p = 0
#define da_redim(name) if (_qy_ ## name ## _p >= _qy_ ## name ## _max) \
name = realloc(name, (_qy_ ## name ## _max += 32) * sizeof(name[0]))
#define da_append(name, x) do {da_redim(name); name[_qy_ ## name ## _p++] = x;} while (0)
#define da_len(name) _qy_ ## name ## _p

// dependancy: atr table in parse.c ordering is based on these
typedef enum {
EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add,
Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident
} TokenType;

typedef struct {
int tok;
int err_ln, err_col;
union {
int n; /* value for constants */
char *text; /* text for idents */
};
} tok_s;

static FILE *source_fp, *dest_fp;
static int line, col, the_ch;
da_dim(text, char);

tok_s gettok();

static void error(int err_line, int err_col, const char *fmt, ... ) {
char buf[1000];
va_list ap;

va_start(ap, fmt);
vsprintf(buf, fmt, ap);
va_end(ap);
printf("(%d,%d) error: %s\n", err_line, err_col, buf);
exit(1);
}

static void read_ch() { /* get next char from input */
the_ch = getc(source_fp);
++col;
if (the_ch == '\n') {
++line;
col = 0;
}
}

static tok_s char_lit(int n, int err_line, int err_col) { /* 'x' */
if (the_ch == '\'')
error(err_line, err_col, "gettok: empty character constant");
if (the_ch == '\\') {
read_ch();
if (the_ch == 'n')
n = 10;
else if (the_ch == '\\')
n = '\\';
else error(err_line, err_col, "gettok: unknown escape sequence \\%c", the_ch);
}
read_ch();
if (the_ch != '\'') error(err_line, err_col, "multi-character constant");
read_ch();
return (tok_s){Integerk, err_line, err_col, {n}};
}

static tok_s div_or_cmt(int err_line, int err_col) { /* process divide or comments */
if (the_ch != '*')
return (tok_s){Div, err_line, err_col, {0}};

/* comment found */
for (;;) {
read_ch();
if (the_ch == '*' || the_ch == EOF) {
read_ch();
if (the_ch == '/' || the_ch == EOF) {
read_ch();
return gettok();
}
}
}
}

static tok_s string_lit(int start, int err_line, int err_col) { /* "st" */
da_rewind(text);

for (read_ch(); the_ch != start; read_ch()) {
if (the_ch == '\n')
error(err_line, err_col, "EOL in string");
if (the_ch == EOF)
error(err_line, err_col, "EOF in string");
da_append(text, (char)the_ch);
}
da_append(text, '\0');

read_ch();
return (tok_s){Stringk, err_line, err_col, {.text=text}};
}

static int kwd_cmp(const void *p1, const void *p2) {
return stricmp(*(char **)p1, *(char **)p2);
}

static TokenType get_ident_type(const char *ident) {
static struct {
char *s;
TokenType sym;
} kwds[] = {
{"if", If},
{"print", Print},
{"putc", Putc},
{"while", While},
}, *kwp;

return (kwp = bsearch(&ident, kwds, NELEMS(kwds), sizeof(kwds[0]), kwd_cmp)) == NULL ? Ident : kwp->sym;
}

static tok_s ident_or_int(int err_line, int err_col) {
int n, is_number = true;

da_rewind(text);
while (isalnum(the_ch) || the_ch == '_') {
da_append(text, (char)the_ch);
if (!isdigit(the_ch))
is_number = false;
read_ch();
}
if (da_len(text) == 0)
error(err_line, err_col, "gettok: unrecognized character (%d) '%c'\n", the_ch, the_ch);
da_append(text, '\0');
if (isdigit(text[0])) {
if (!is_number)
error(err_line, err_col, "invalid number: %s\n", text);
n = strtol(text, NULL, 0);
if (n == LONG_MAX && errno == ERANGE)
error(err_line, err_col, "Number exceeds maximum value");
return (tok_s){Integerk, err_line, err_col, {n}};
}
return (tok_s){get_ident_type(text), err_line, err_col, {.text=text}};
}

static tok_s follow(int expect, TokenType ifyes, TokenType ifno, int err_line, int err_col) { /* look ahead for '>=', etc. */
if (the_ch == expect) {
read_ch();
return (tok_s){ifyes, err_line, err_col, {0}};
}
if (ifno == EOI) error(err_line, err_col, "follow: unrecognized character '%c' (%d)\n", the_ch, the_ch);
return (tok_s){ifno, err_line, err_col, {0}};
}

tok_s gettok() { /* return the token type */
/* skip white space */
while (isspace(the_ch))
read_ch();
int err_line = line;
int err_col = col;
switch (the_ch) {
case '{': read_ch(); return (tok_s){Lbrace, err_line, err_col, {0}};
case '}': read_ch(); return (tok_s){Rbrace, err_line, err_col, {0}};
case '(': read_ch(); return (tok_s){Lparen, err_line, err_col, {0}};
case ')': read_ch(); return (tok_s){Rparen, err_line, err_col, {0}};
case '+': read_ch(); return (tok_s){Add, err_line, err_col, {0}};
case '-': read_ch(); return (tok_s){Sub, err_line, err_col, {0}};
case '*': read_ch(); return (tok_s){Mul, err_line, err_col, {0}};
case ';': read_ch(); return (tok_s){Semi, err_line, err_col, {0}};
case ',': read_ch(); return (tok_s){Comma, err_line, err_col, {0}};
case '>': read_ch(); return (tok_s){Gtr, err_line, err_col, {0}};
case '=': read_ch(); return (tok_s){Assign, err_line, err_col, {0}};
case '/': read_ch(); return div_or_cmt(err_line, err_col);
case '\'': read_ch(); return char_lit(the_ch, err_line, err_col);
case '<': read_ch(); return follow('=', Leq, Lss, err_line, err_col);
case '!': read_ch(); return follow('=', Neq, EOI, err_line, err_col);
case '&': read_ch(); return follow('&', And, EOI, err_line, err_col);
case '"' : return string_lit(the_ch, err_line, err_col);
default: return ident_or_int(err_line, err_col);
case EOF: return (tok_s){EOI, err_line, err_col, {0}};
}
}

void init_lex() { /* initialize the scanner */
line = 1;
read_ch();
}

void run() { /* tokenize the given input */
tok_s tok;
do {
tok = gettok();
fprintf(dest_fp, "line %5d col %5d %.8s",
tok.err_ln, tok.err_col,
&"EOI Print Putc If While Lbrace Rbrace Lparen Rparen "
"Uminus Mul Div Add Sub Lss Gtr Leq Neq "
"And Semi Comma Assign Integer String Ident "[tok.tok * 9]);
if (tok.tok == Integerk)
fprintf(dest_fp, " %8d", tok.n);
else if (tok.tok == Ident)
fprintf(dest_fp, " %s", tok.text);
else if (tok.tok == Stringk)
fprintf(dest_fp, " \"%s\"", tok.text);
fprintf(dest_fp, "\n");
} while (tok.tok != EOI);
if (dest_fp != stdout)
fclose(dest_fp);
}

void init_io(FILE **fp, FILE *std, const char mode[], const char fn[]) {
if (fn[0] == '\0')
*fp = std;
else if ((*fp = fopen(fn, mode)) == NULL)
error(0, 0, "Can't open %s\n", fn);
}

int main(int argc, char *argv[]) {
init_io(&source_fp, stdin, "r", argc > 1 ? argv[1] : "");
init_io(&dest_fp, stdout, "wb", argc > 2 ? argv[2] : "");
init_lex();
run();
}
</lang>

=={{header|Python}}==
<lang Python>

import sys

# following two must remain in the same order
EOI, Print, Putc, If, While, Lbrace, Rbrace, Lparen, Rparen, Uminus, Mul, Div, Add, \
Sub, Lss, Gtr, Leq, Neq, And, Semi, Comma, Assign, Integerk, Stringk, Ident = range(25)

all_syms = [ 'EOI', 'Print', 'Putc', 'If', 'While', 'Lbrace', 'Rbrace', 'Lparen',
'Rparen', 'Uminus', 'Mul', 'Div', 'Add', 'Sub', 'Lss', 'Gtr', 'Leq', 'Neq', 'And',
'Semi', 'Comma', 'Assign', 'Integer', 'String', 'Ident' ]

# single character only symbols
symbols = { '{': Lbrace, '}': Rbrace, '(': Lparen, ')': Rparen, '+': Add, '-': Sub,
'*': Mul, ';': Semi, ',': Comma, '>': Gtr, '=': Assign }

key_words = { 'if': If, 'print': Print, 'putc': Putc, 'while': While }

the_ch = " " # dummy first char - but it must be a space
the_col = 0
the_line = 1
input_file = None

#*** show error and exit
def error(line, col, msg):
print line, col, msg
exit(1)

#*** get the next character from the input
def getc():
global the_ch, the_col, the_line

the_ch = input_file.read(1)
the_col += 1
if the_ch == '\n':
the_line += 1
the_col = 0
return the_ch

#*** 'x' - character constants
def char_lit(err_line, err_col):
n = ord(getc()) # skip opening quote
if the_ch == '\'':
error(err_line, err_col, "empty character constant")
elif the_ch == '\\':
getc()
if the_ch == 'n':
n = 10
elif the_ch == '\\':
n = '\\'
else:
error(err_line, err_col, "unknown escape sequence \\%c" % (the_ch))
if getc() != '\'':
error(err_line, err_col, "multi-character constant")
getc()
return Integerk, err_line, err_col, n

#*** process divide or comments
def div_or_cmt(err_line, err_col):
if getc() != '*':
return Div, err_line, err_col

# comment found
while True:
if getc() == '*' and getc() == '/':
getc()
return gettok()
elif len(the_ch) == 0:
error(err_line, err_col, "EOF in comment")

#*** "string"
def string_lit(start, err_line, err_col):
text = ""

while getc() != start:
if len(the_ch) == 0:
error(err_line, err_col, "EOF while scanning string literal")
if the_ch == '\n':
error(err_line, err_col, "EOL while scanning string literal")
text += the_ch

getc()
return Stringk, err_line, err_col, text

#*** handle identifiers and integers
def ident_or_int(err_line, err_col):
is_number = True
text = ""

while the_ch.isalnum() or the_ch == '_':
text += the_ch
if not the_ch.isdigit():
is_number = False
getc()

if len(text) == 0:
error(err_line, err_col, "ident_or_int: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))

if text[0].isdigit():
if not is_number:
error(err_line, err_col, "invalid number: %s" % (text))
n = int(text)
return Integerk, err_line, err_col, n

if text in key_words:
return key_words[text], err_line, err_col

return Ident, err_line, err_col, text

#*** look ahead for '>=', etc.
def follow(expect, ifyes, ifno, err_line, err_col):
if getc() == expect:
getc()
return ifyes, err_line, err_col

if ifno == EOI:
error(err_line, err_col, "follow: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))

return ifno, err_line, err_col

#*** return the next token type
def gettok():
while the_ch.isspace():
getc()

err_line = the_line
err_col = the_col

if len(the_ch) == 0: return EOI, err_line, err_col
elif the_ch in symbols: sym = symbols[the_ch]; getc(); return sym, err_line, err_col
elif the_ch == '/': return div_or_cmt(err_line, err_col)
elif the_ch == '\'': return char_lit(err_line, err_col)
elif the_ch == '<': return follow('=', Leq, Lss, err_line, err_col)
elif the_ch == '!': return follow('=', Neq, EOI, err_line, err_col)
elif the_ch == '&': return follow('&', And, EOI, err_line, err_col)
elif the_ch == '"': return string_lit(the_ch, err_line, err_col)
else: return ident_or_int(err_line, err_col)

#*** main driver
input_file = sys.stdin
if len(sys.argv) > 1:
try:
input_file = open(sys.argv[1], "r", 4096)
except IOError as e:
error(0, 0, "Can't open %s" % sys.argv[1])

while True:
t = gettok()
tok = t[0]
line = t[1]
col = t[2]

if tok == Integerk:
print "line %5d col %5d %-8s %8d" % (line, col, all_syms[tok], t[3])
elif tok == Ident:
print "line %5d col %5d %-8s %s" % (line, col, all_syms[tok], t[3])
elif tok == Stringk:
print 'line %5d col %5d %-8s "%s"' % (line, col, all_syms[tok], t[3])
else:
print "line %5d col %5d %-8s" % (line, col, all_syms[tok])

if tok == EOI:
break
</lang>

Revision as of 11:54, 10 August 2016

Hello