Compiler/lexical analyzer: Difference between revisions

m
m (→‎{{header|Wren}}: Minor tidy)
 
(38 intermediate revisions by 5 users not shown)
Line 158:
For example, the following two program fragments are equivalent, and should produce the same token stream except for the line and column positions:
 
* <langsyntaxhighlight lang="c">if ( p /* meaning n is prime */ ) {
print ( n , " " ) ;
count = count + 1 ; /* number of primes found so far */
}</langsyntaxhighlight>
* <langsyntaxhighlight lang="c">if(p){print(n," ");count=count+1;}</langsyntaxhighlight>
 
;Complete list of token names
Line 237:
| style="vertical-align:top" |
Test Case 1:
<langsyntaxhighlight lang="c">/*
Hello world
*/
print("Hello, World!\n");</langsyntaxhighlight>
 
| style="vertical-align:top" |
Line 255:
| style="vertical-align:top" |
Test Case 2:
<langsyntaxhighlight lang="c">/*
Show Ident and Integers
*/
phoenix_number = 142857;
print(phoenix_number, "\n");</langsyntaxhighlight>
 
| style="vertical-align:top" |
Line 280:
| style="vertical-align:top" |
Test Case 3:
<langsyntaxhighlight lang="c">/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
Line 301:
/* character literal */ '\n'
/* character literal */ '\\'
/* character literal */ ' '</langsyntaxhighlight>
 
| style="vertical-align:top" |
Line 344:
| style="vertical-align:top" |
Test Case 4:
<langsyntaxhighlight lang="c">/*** test printing, embedded \n and comments with lots of '*' ***/
print(42);
print("\nHello World\nGood Bye\nok\n");
print("Print a slash n - \\n.\n");</langsyntaxhighlight>
 
| style="vertical-align:top" |
Line 388:
 
=={{header|Ada}}==
<langsyntaxhighlight lang="ada">with Ada.Text_IO, Ada.Streams.Stream_IO, Ada.Strings.Unbounded, Ada.Command_Line,
Ada.Exceptions;
use Ada.Strings, Ada.Strings.Unbounded, Ada.Streams, Ada.Exceptions;
Line 648:
when error : others => IO.Put_Line("Error: " & Exception_Message(error));
end Main;
</syntaxhighlight>
</lang>
{{out}} Test case 3:
<pre>
Line 686:
23 1 END_OF_INPUT
</pre>
 
=={{header|ALGOL 68}}==
This is a simple ''token in, line out'' program. It doesn't keep an internal representation of tokens or anything like that, since that's not needed at all.
 
As an addition, it emits a diagnostic if integer literals are too big.
<syntaxhighlight lang="algol68">BEGIN
# implement C-like getchar, where EOF and EOLn are "characters" (-1 and 10 resp.). #
INT eof = -1, eoln = 10;
BOOL eof flag := FALSE;
STRING buf := "";
INT col := 1;
INT line := 0;
on logical file end (stand in, (REF FILE f)BOOL: eof flag := TRUE);
PROC getchar = INT:
IF eof flag THEN eof
ELIF col = UPB buf THEN col +:= 1; eoln
ELIF col > UPB buf THEN IF line > 0 THEN read(newline) FI;
line +:= 1;
read(buf);
IF eof flag THEN col := 1; eof
ELSE col := 0; getchar
FI
ELSE col +:= 1; ABS buf[col]
FI;
PROC nextchar = INT: IF eof flag THEN eof ELIF col >= UPB buf THEN eoln ELSE ABS buf[col+1] FI;
 
PROC is blank = (INT ch) BOOL: ch = 0 OR ch = 9 OR ch = 10 OR ch = 13 OR ch = ABS " ";
PROC is digit = (INT ch) BOOL: ch >= ABS "0" AND ch <= ABS "9";
PROC is ident start = (INT ch) BOOL: ch >= ABS "A" AND ch <= ABS "Z" OR
ch >= ABS "a" AND ch <= ABS "z" OR
ch = ABS "_";
PROC is ident = (INT ch) BOOL: is ident start(ch) OR is digit(ch);
 
PROC ident or keyword = (INT start char) VOID:
BEGIN
STRING w := REPR start char;
INT start col = col;
WHILE is ident (next char) DO w +:= REPR getchar OD;
IF w = "if" THEN output2("Keyword_if", start col)
ELIF w = "else" THEN output2("Keyword_else", start col)
ELIF w = "while" THEN output2("Keyword_while", start col)
ELIF w = "print" THEN output2("Keyword_print", start col)
ELIF w = "putc" THEN output2("Keyword_putc", start col)
ELSE output2("Identifier " + w, start col)
FI
END;
PROC char = VOID:
BEGIN
INT start col = col;
INT ch := getchar;
IF ch = ABS "'" THEN error("Empty character constant")
ELIF ch = ABS "\" THEN ch := getchar;
IF ch = ABS "n" THEN ch := 10
ELIF ch = ABS "\" THEN SKIP
ELSE error("Unknown escape sequence. \" + REPR ch)
FI
FI;
IF nextchar /= ABS "'" THEN error("Multi-character constant.") FI;
getchar;
output2("Integer " + whole(ch, 0), start col)
END;
PROC string = VOID:
BEGIN
INT start col = col;
STRING s := """";
WHILE INT ch := getchar; ch /= ABS """"
DO
IF ch = eoln THEN error("End-of-line while scanning string literal. Closing string character not found before end-of-line.")
ELIF ch = eof THEN error("End-of-file while scanning string literal. Closing string character not found.")
ELIF ch = ABS "\" THEN s +:= REPR ch; ch := getchar;
IF ch /= ABS "\" AND ch /= ABS "n" THEN error("Unknown escape sequence. \" + REPR ch) FI;
s +:= REPR ch
ELSE s +:= REPR ch
FI
OD;
output2("String " + s + """", start col)
END;
PROC comment = VOID:
BEGIN
WHILE INT ch := getchar; NOT (ch = ABS "*" AND nextchar = ABS "/")
DO IF ch = eof THEN error("End-of-file in comment. Closing comment characters not found.") FI
OD;
getchar
END;
PROC number = (INT first digit) VOID:
BEGIN
INT start col = col;
INT n := first digit - ABS "0";
WHILE is digit (nextchar) DO
INT u := getchar - ABS "0";
IF LENG n * 10 + LENG u > max int THEN error("Integer too big") FI;
n := n * 10 + u
OD;
IF is ident start (nextchar) THEN error("Invalid number. Starts like a number, but ends in non-numeric characters.") FI;
output2("Integer " + whole(n, 0), start col)
END;
 
PROC output = (STRING s) VOID: output2(s, col);
PROC output2 = (STRING s, INT col) VOID: print((whole(line,-8), whole(col,-8), " ", s, newline));
 
PROC if follows = (CHAR second, STRING longer, shorter) VOID:
IF nextchar = ABS second
THEN output(longer); getchar
ELSE output(shorter)
FI;
PROC error = (STRING s)VOID: (put(stand error, ("At ", whole(line,0), ":", whole(col,0), " ", s, new line)); stop);
PROC unrecognized = (INT char) VOID: error("Unrecognized character " + REPR char);
PROC double char = (INT first, STRING op) VOID:
IF nextchar /= first THEN unrecognized(first)
ELSE output2(op, col-1); getchar
FI;
 
WHILE INT ch := getchar; ch /= eof
DO
IF is blank(ch) THEN SKIP
ELIF ch = ABS "(" THEN output("LeftParen")
ELIF ch = ABS ")" THEN output("RightParen")
ELIF ch = ABS "{" THEN output("LeftBrace")
ELIF ch = ABS "}" THEN output("RightBrace")
ELIF ch = ABS ";" THEN output("Semicolon")
ELIF ch = ABS "," THEN output("Comma")
ELIF ch = ABS "*" THEN output("Op_multiply")
ELIF ch = ABS "/" THEN IF next char = ABS "*" THEN comment
ELSE output("Op_divide")
FI
ELIF ch = ABS "%" THEN output("Op_mod")
ELIF ch = ABS "+" THEN output("Op_add")
ELIF ch = ABS "-" THEN output("Op_subtract")
ELIF ch = ABS "<" THEN if follows("=", "Op_lessequal", "Op_less")
ELIF ch = ABS ">" THEN if follows("=", "Op_greaterequal", "Op_greater")
ELIF ch = ABS "=" THEN if follows("=", "Op_equal", "Op_assign")
ELIF ch = ABS "!" THEN if follows("=", "Op_notequal", "Op_not")
ELIF ch = ABS "&" THEN double char(ch, "Op_and")
ELIF ch = ABS "|" THEN double char(ch, "Op_or")
ELIF is ident start (ch) THEN ident or keyword (ch)
ELIF ch = ABS """" THEN string
ELIF ch = ABS "'" THEN char
ELIF is digit(ch) THEN number(ch)
ELSE unrecognized(ch)
FI
OD;
output("End_Of_Input")
END</syntaxhighlight>
 
=={{header|ALGOL W}}==
<langsyntaxhighlight lang="algolw">begin
%lexical analyser %
% Algol W strings are limited to 256 characters in length so we limit source lines %
Line 981 ⟶ 1,124:
while nextToken not = tEnd_of_input do writeToken;
writeToken
end.</langsyntaxhighlight>
{{out}} Test case 3:
<pre>
Line 1,024 ⟶ 1,167:
One interesting feature of this implementation is my liberal use of a pushback buffer for input characters. This kept the code modular and easier to write.
 
(One point of note: the C "EOF" pseudo-character is detected in the following code by looking for a negative number. That EOF has to be negative and the other characters non-negative is implied by the ISO C standard.)
<lang ATS>(********************************************************************)
 
<syntaxhighlight lang="ats">(********************************************************************)
(* Usage: lex [INPUTFILE [OUTPUTFILE]]
If INPUTFILE or OUTPUTFILE is "-" or missing, then standard input
Line 1,539 ⟶ 1,684:
val cn = ch.column_no
in
case+if int2char0 (ch.ichar) of< 0 then
| ',' => ((TOKEN_COMMATOKEN_END_OF_INPUT, ",", ln, cn), inp)
else
| ';' => ((TOKEN_SEMICOLON, ";", ln, cn), inp)
| '\(' =>case+ ((TOKEN_LEFTPAREN,int2char0 "(", ln, cnch.ichar), inp)of
| '),' => ((TOKEN_RIGHTPARENTOKEN_COMMA, "),", ln, cn), inp)
| '\{;' => ((TOKEN_LEFTBRACETOKEN_SEMICOLON, "{;", ln, cn), inp)
| '}\(' => ((TOKEN_RIGHTBRACETOKEN_LEFTPAREN, "}(", ln, cn), inp)
| '*)' => ((TOKEN_MULTIPLYTOKEN_RIGHTPAREN, "*)", ln, cn), inp)
| '/\{' => ((TOKEN_DIVIDETOKEN_LEFTBRACE, "/{", ln, cn), inp)
| '%}' => ((TOKEN_MODTOKEN_RIGHTBRACE, "%}", ln, cn), inp)
| '+*' => ((TOKEN_ADDTOKEN_MULTIPLY, "+*", ln, cn), inp)
| '-/' => ((TOKEN_SUBTRACTTOKEN_DIVIDE, "-/", ln, cn), inp)
| '<%' => ((TOKEN_MOD, "%", ln, cn), inp)
| '+' => ((TOKEN_ADD, "+", ln, cn), inp)
let
| '-' val=> (ch1(TOKEN_SUBTRACT, inp)"-", =ln, get_chcn), inp)
in| '<' =>
let
if (ch1.ichar) = char2i '=' then
val ((TOKEN_LESSEQUALch1, "<inp) =", ln, cn),get_ch inp)
elsein
letif (ch1.ichar) = char2i '=' then
val inp((TOKEN_LESSEQUAL, "<=", push_back_chln, (ch1cn), inp)
inelse
((TOKEN_LESS, "<", ln, cn), inp)let
end val inp = push_back_ch (ch1, inp)
end in
((TOKEN_LESS, "<", ln, cn), inp)
| '>' =>
let end
val (ch1, inp) = get_ch inpend
in| '>' =>
let
if (ch1.ichar) = char2i '=' then
val ((TOKEN_GREATEREQUALch1, ">inp) =", ln, cn),get_ch inp)
elsein
letif (ch1.ichar) = char2i '=' then
val inp((TOKEN_GREATEREQUAL, ">=", push_back_chln, (ch1cn), inp)
inelse
((TOKEN_GREATER, ">", ln, cn), inp)let
end val inp = push_back_ch (ch1, inp)
end in
((TOKEN_GREATER, ">", ln, cn), inp)
| '=' =>
let end
val (ch1, inp) = get_ch inpend
in| '=' =>
let
if (ch1.ichar) = char2i '=' then
val ((TOKEN_EQUALch1, "inp) ==", ln, cn),get_ch inp)
elsein
letif (ch1.ichar) = char2i '=' then
val inp((TOKEN_EQUAL, "==", push_back_chln, (ch1cn), inp)
inelse
((TOKEN_ASSIGN, "=", ln, cn), inp)let
end val inp = push_back_ch (ch1, inp)
end in
((TOKEN_ASSIGN, "=", ln, cn), inp)
| '!' =>
let end
val (ch1, inp) = get_ch inpend
in| '!' =>
let
if (ch1.ichar) = char2i '=' then
val ((TOKEN_NOTEQUALch1, "!inp) =", ln, cn),get_ch inp)
elsein
letif (ch1.ichar) = char2i '=' then
val inp((TOKEN_NOTEQUAL, "!=", push_back_chln, (ch1cn), inp)
inelse
((TOKEN_NOT, "!", ln, cn), inp)let
end val inp = push_back_ch (ch1, inp)
end in
((TOKEN_NOT, "!", ln, cn), inp)
| '&' =>
let end
val (ch1, inp) = get_ch inpend
in| '&' =>
let
if (ch1.ichar) = char2i '&' then
val ((TOKEN_ANDch1, "&&",inp) ln,= cn),get_ch inp)
elsein
$raise unexpected_characterif (chch1.line_no,ichar) ch.column_no,= char2i '&' then
((TOKEN_AND, "&&", ln, cn), ch.icharinp)
end else
$raise unexpected_character (ch.line_no, ch.column_no,
| '|' =>
ch.ichar)
let
val (ch1, inp) = get_ch inpend
in| '|' =>
let
if (ch1.ichar) = char2i '|' then
val ((TOKEN_ORch1, "||",inp) ln,= cn),get_ch inp)
elsein
$raise unexpected_characterif (chch1.line_no,ichar) ch.column_no,= char2i '|' then
((TOKEN_OR, "||", ln, cn), ch.icharinp)
end else
$raise unexpected_character (ch.line_no, ch.column_no,
| '"' =>
ch.ichar)
let
end
val inp = push_back_ch (ch, inp)
in| '"' =>
scan_string_literal inplet
val inp = push_back_ch (ch, inp)
end
| '\'' => in
scan_string_literal inp
let
end
val inp = push_back_ch (ch, inp)
in| '\'' =>
scan_character_literal inplet
val inp = push_back_ch (ch, inp)
end
| _ when isdigit (ch.ichar) =>in
scan_character_literal inp
let
end
val inp = push_back_ch (ch, inp)
| _ when isdigit (ch.ichar) =>
in
let
scan_integer_literal (inp, lookups)
val inp = push_back_ch (ch, inp)
end
in
| _ when is_ident_start (ch.ichar) =>
scan_integer_literal (inp, lookups)
let
end
val inp = push_back_ch (ch, inp)
| _ when is_ident_start (ch.ichar) =>
in
let
scan_identifier_or_reserved_word (inp, lookups)
val inp = push_back_ch (ch, inp)
end
in
| _ => $raise unexpected_character (ch.line_no, ch.column_no,
scan_identifier_or_reserved_word (inp, lookups)
ch.ichar)
end
| _ => $raise unexpected_character (ch.line_no, ch.column_no,
ch.ichar)
end
 
Line 1,702 ⟶ 1,850:
lookups : !lookups_vt) : void =
let
val (toktup, inp) = skip_spaces_and_commentsget_next_token (inp, lookups)
val (ch, inp) = get_ch inp
val ln = ch.line_no
val cn = ch.column_no
in
ifprint_token (ch.ichar)outf, <toktup, 0 thenlookups);
if toktup.0 print_token<> (outf, (TOKEN_END_OF_INPUT, "", ln, cn),then
loop (inp, lookups)
else
let
val inp = push_back_ch (ch, inp)
val (toktup, inp) = get_next_token (inp, lookups)
in
print_token (outf, toktup, lookups);
loop (inp, lookups)
end
end
in
Line 1,904 ⟶ 2,041:
end
 
(********************************************************************)</langsyntaxhighlight>
 
{{out}}
Line 1,945 ⟶ 2,082:
=={{header|AWK}}==
Tested with gawk 4.1.1 and mawk 1.3.4.
<syntaxhighlight lang="awk">
<lang AWK>
BEGIN {
all_syms["tk_EOI" ] = "End_of_input"
Line 2,151 ⟶ 2,288:
}
}
</syntaxhighlight>
</lang>
{{out|case=count}}
<b>
Line 2,188 ⟶ 2,325:
=={{header|C}}==
Tested with gcc 4.81 and later, compiles warning free with -Wpedantic -pedantic -Wall -Wextra
<langsyntaxhighlight Clang="c">#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
Line 2,420 ⟶ 2,557:
run();
return 0;
}</langsyntaxhighlight>
 
{{out|case=test case 3}}
Line 2,464 ⟶ 2,601:
=={{header|C sharp|C#}}==
Requires C#6.0 because of the use of null coalescing operators.
<langsyntaxhighlight lang="csharp">
using System;
using System.IO;
Line 2,814 ⟶ 2,951:
}
}
</syntaxhighlight>
</lang>
 
{{out|case=test case 3}}
Line 2,858 ⟶ 2,995:
=={{header|C++}}==
Tested with GCC 9.3.0 (g++ -std=c++17)
<langsyntaxhighlight lang="cpp">#include <charconv> // std::from_chars
#include <fstream> // file_to_string, string_to_file
#include <functional> // std::invoke
Line 3,243 ⟶ 3,380:
});
}
</syntaxhighlight>
</lang>
 
{{out|case=test case 3}}
Line 3,288 ⟶ 3,425:
Using GnuCOBOL 2. By Steve Williams (with one change to get around a Rosetta Code code highlighter problem).
 
<langsyntaxhighlight lang="cobol"> >>SOURCE FORMAT IS FREE
*> this code is dedicated to the public domain
*> (GnuCOBOL) 2.3-dev.0
Line 3,694 ⟶ 3,831:
end-if
.
end program lexer.</langsyntaxhighlight>
 
{{out|case=test case 3}}
Line 3,736 ⟶ 3,873:
Lisp has a built-in reader and you can customize the reader by modifying its readtable. I'm also using the Gray stream, which is an almost standard feature of Common Lisp, for counting lines and columns.
 
<langsyntaxhighlight lang="lisp">(defpackage #:lexical-analyzer
(:use #:cl #:sb-gray)
(:export #:main))
Line 3,949 ⟶ 4,086:
 
(defun main ()
(lex *standard-input*))</langsyntaxhighlight>
{{out|case=test case 3}}
<pre> 5 16 KEYWORD-PRINT
Line 3,985 ⟶ 4,122:
22 26 INTEGER 32
23 1 END-OF-INPUT </pre>
 
=={{header|Elixir}}==
{{works with|Elixir|1.13.3}}
{{trans|ATS}}
 
<syntaxhighlight lang="elixir">#!/bin/env elixir
# -*- elixir -*-
 
defmodule Lex do
 
def main args do
{inpf_name, outf_name, exit_status} =
case args do
[] -> {"-", "-", 0}
[name] -> {name, "-", 0}
[name1, name2] -> {name1, name2, 0}
[name1, name2 | _] -> {name1, name2, usage_error()}
end
 
{inpf, outf, exit_status} =
case {inpf_name, outf_name, exit_status} do
{"-", "-", 0} -> {:stdio, :stdio, 0}
{name1, "-", 0} ->
{inpf, exit_status} = open_file(name1, [:read])
{inpf, :stdio, exit_status}
{"-", name2, 0} ->
{outf, exit_status} = open_file(name2, [:write])
{:stdio, outf, exit_status}
{name1, name2, 0} ->
{inpf, exit_status} = open_file(name1, [:read])
if exit_status != 0 do
{inpf, name2, exit_status}
else
{outf, exit_status} = open_file(name2, [:write])
{inpf, outf, exit_status}
end
_ -> {inpf_name, outf_name, exit_status}
end
 
exit_status =
case exit_status do
0 -> main_program inpf, outf
_ -> exit_status
end
 
# Choose one.
System.halt exit_status # Fast exit.
#System.stop exit_status # Laborious cleanup.
end
 
def main_program inpf, outf do
inp = make_inp inpf
scan_text outf, inp
exit_status = 0
exit_status
end
 
def open_file name, rw do
case File.open name, rw do
{:ok, f} -> {f, 0}
_ ->
IO.write :stderr, "Cannot open "
IO.write :stderr, name
case rw do
[:read] -> IO.puts " for input"
[:write] -> IO.puts " for output"
end
{name, 1}
end
end
 
def scan_text outf, inp do
{toktup, inp} = get_next_token inp
print_token outf, toktup
case toktup do
{"End_of_input", _, _, _} -> :ok
_ -> scan_text outf, inp
end
end
 
def print_token outf, {tok, arg, line_no, column_no} do
IO.write outf, (String.pad_leading "#{line_no}", 5)
IO.write outf, " "
IO.write outf, (String.pad_leading "#{column_no}", 5)
IO.write outf, " "
IO.write outf, tok
case tok do
"Identifier" ->
IO.write outf, " "
IO.write outf, arg
"Integer" ->
IO.write outf, " "
IO.write outf, arg
"String" ->
IO.write outf, " "
IO.write outf, arg
_ -> :ok
end
IO.puts outf, ""
end
 
###-------------------------------------------------------------------
###
### The token dispatcher.
###
 
def get_next_token inp do
inp = skip_spaces_and_comments inp
{ch, inp} = get_ch inp
{chr, line_no, column_no} = ch
ln = line_no
cn = column_no
case chr do
:eof -> {{"End_of_input", "", ln, cn}, inp}
"," -> {{"Comma", ",", ln, cn}, inp}
";" -> {{"Semicolon", ";", ln, cn}, inp}
"(" -> {{"LeftParen", "(", ln, cn}, inp}
")" -> {{"RightParen", ")", ln, cn}, inp}
"{" -> {{"LeftBrace", "{", ln, cn}, inp}
"}" -> {{"RightBrace", "}", ln, cn}, inp}
"*" -> {{"Op_multiply", "*", ln, cn}, inp}
"/" -> {{"Op_divide", "/", ln, cn}, inp}
"%" -> {{"Op_mod", "%", ln, cn}, inp}
"+" -> {{"Op_add", "+", ln, cn}, inp}
"-" -> {{"Op_subtract", "-", ln, cn}, inp}
"<" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"=" -> {{"Op_lessequal", "<=", ln, cn}, inp}
_ -> {{"Op_less", "<", ln, cn}, (push_back ch1, inp)}
end
">" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"=" -> {{"Op_greaterequal", ">=", ln, cn}, inp}
_ -> {{"Op_greater", ">", ln, cn}, (push_back ch1, inp)}
end
"=" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"=" -> {{"Op_equal", "==", ln, cn}, inp}
_ -> {{"Op_assign", "=", ln, cn}, (push_back ch1, inp)}
end
"!" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"=" -> {{"Op_notequal", "!=", ln, cn}, inp}
_ -> {{"Op_not", "!", ln, cn}, (push_back ch1, inp)}
end
"&" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"&" -> {{"Op_and", "&&", ln, cn}, inp}
_ -> unexpected_character ln, cn, chr
end
"|" ->
{ch1, inp} = get_ch inp
{chr1, _, _} = ch1
case chr1 do
"|" -> {{"Op_or", "||", ln, cn}, inp}
_ -> unexpected_character ln, cn, chr
end
"\"" ->
inp = push_back ch, inp
scan_string_literal inp
"'" ->
inp = push_back ch, inp
scan_character_literal inp
_ ->
cond do
String.match? chr, ~r/^[[:digit:]]$/u ->
inp = push_back ch, inp
scan_integer_literal inp
String.match? chr, ~r/^[[:alpha:]_]$/u ->
inp = push_back ch, inp
scan_identifier_or_reserved_word inp
true -> unexpected_character ln, cn, chr
end
end
end
 
###-------------------------------------------------------------------
###
### Skipping past spaces and /* ... */ comments.
###
### Comments are treated exactly like a bit of whitespace. They never
### make it to the dispatcher.
###
 
def skip_spaces_and_comments inp do
{ch, inp} = get_ch inp
{chr, line_no, column_no} = ch
cond do
chr == :eof -> push_back ch, inp
String.match? chr, ~r/^[[:space:]]$/u ->
skip_spaces_and_comments inp
chr == "/" ->
{ch1, inp} = get_ch inp
case ch1 do
{"*", _, _} ->
inp = scan_comment inp, line_no, column_no
skip_spaces_and_comments inp
_ -> push_back ch, (push_back ch1, inp)
end
true -> push_back ch, inp
end
end
 
def scan_comment inp, line_no, column_no do
{ch, inp} = get_ch inp
case ch do
{:eof, _, _} -> unterminated_comment line_no, column_no
{"*", _, _} ->
{ch1, inp} = get_ch inp
case ch1 do
{:eof, _, _} -> unterminated_comment line_no, column_no
{"/", _, _} -> inp
_ -> scan_comment inp, line_no, column_no
end
_ -> scan_comment inp, line_no, column_no
end
end
 
###-------------------------------------------------------------------
###
### Scanning of integer literals, identifiers, and reserved words.
###
### These three types of token are very similar to each other.
###
 
def scan_integer_literal inp do
# Scan an entire word, not just digits. This way we detect
# erroneous text such as "23skidoo".
{line_no, column_no, inp} = get_position inp
{word, inp} = scan_word inp
if String.match? word, (~r/^[[:digit:]]+$/u) do
{{"Integer", word, line_no, column_no}, inp}
else
invalid_integer_literal line_no, column_no, word
end
end
 
def scan_identifier_or_reserved_word inp do
# It is assumed that the first character is of the correct type,
# thanks to the dispatcher.
{line_no, column_no, inp} = get_position inp
{word, inp} = scan_word inp
tok =
case word do
"if" -> "Keyword_if"
"else" -> "Keyword_else"
"while" -> "Keyword_while"
"print" -> "Keyword_print"
"putc" -> "Keyword_putc"
_ -> "Identifier"
end
{{tok, word, line_no, column_no}, inp}
end
 
def scan_word inp, word\\"" do
{ch, inp} = get_ch inp
{chr, _, _} = ch
if String.match? chr, (~r/^[[:alnum:]_]$/u) do
scan_word inp, (word <> chr)
else
{word, (push_back ch, inp)}
end
end
 
def get_position inp do
{ch, inp} = get_ch inp
{_, line_no, column_no} = ch
inp = push_back ch, inp
{line_no, column_no, inp}
end
 
###-------------------------------------------------------------------
###
### Scanning of string literals.
###
### It is assumed that the first character is the opening quote, and
### that the closing quote is the same character.
###
 
def scan_string_literal inp do
{ch, inp} = get_ch inp
{quote_mark, line_no, column_no} = ch
{contents, inp} = scan_str_lit inp, ch
{{"String", quote_mark <> contents <> quote_mark,
line_no, column_no},
inp}
end
 
def scan_str_lit inp, ch, contents\\"" do
{quote_mark, line_no, column_no} = ch
{ch1, inp} = get_ch inp
{chr1, line_no1, column_no1} = ch1
if chr1 == quote_mark do
{contents, inp}
else
case chr1 do
:eof -> eoi_in_string_literal line_no, column_no
"\n" -> eoln_in_string_literal line_no, column_no
"\\" ->
{ch2, inp} = get_ch inp
{chr2, _, _} = ch2
case chr2 do
"n" -> scan_str_lit inp, ch, (contents <> "\\n")
"\\" -> scan_str_lit inp, ch, (contents <> "\\\\")
_ -> unsupported_escape line_no1, column_no1, chr2
end
_ -> scan_str_lit inp, ch, (contents <> chr1)
end
end
end
 
###-------------------------------------------------------------------
###
### Scanning of character literals.
###
### It is assumed that the first character is the opening quote, and
### that the closing quote is the same character.
###
### The tedious part of scanning a character literal is distinguishing
### between the kinds of lexical error. (One might wish to modify the
### code to detect, as a distinct kind of error, end of line within a
### character literal.)
###
 
def scan_character_literal inp do
{ch, inp} = get_ch inp
{_, line_no, column_no} = ch
{ch1, inp} = get_ch inp
{chr1, line_no1, column_no1} = ch1
{intval, inp} =
case chr1 do
:eof -> unterminated_character_literal line_no, column_no
"\\" ->
{ch2, inp} = get_ch inp
{chr2, _, _} = ch2
case chr2 do
:eof -> unterminated_character_literal line_no, column_no
"n" -> {(:binary.first "\n"), inp}
"\\" -> {(:binary.first "\\"), inp}
_ -> unsupported_escape line_no1, column_no1, chr2
end
_ -> {(:binary.first chr1), inp}
end
inp = check_character_literal_end inp, ch
{{"Integer", "#{intval}", line_no, column_no}, inp}
end
 
def check_character_literal_end inp, ch do
{chr, _, _} = ch
{{chr1, _, _}, inp} = get_ch inp
if chr1 == chr do
inp
else
# Lexical error.
find_char_lit_end inp, ch
end
end
 
def find_char_lit_end inp, ch do
{chr, line_no, column_no} = ch
{{chr1, _, _}, inp} = get_ch inp
if chr1 == chr do
multicharacter_literal line_no, column_no
else
case chr1 do
:eof -> unterminated_character_literal line_no, column_no
_ -> find_char_lit_end inp, ch
end
end
end
 
###-------------------------------------------------------------------
###
### Character-at-a-time input, with unrestricted pushback, and with
### line and column numbering.
###
 
def make_inp inpf do
{inpf, [], 1, 1}
end
 
def get_ch {inpf, pushback, line_no, column_no} do
case pushback do
[head | tail] ->
{head, {inpf, tail, line_no, column_no}}
[] ->
case IO.read(inpf, 1) do
:eof ->
{{:eof, line_no, column_no},
{inpf, pushback, line_no, column_no}}
{:error, _} ->
{{:eof, line_no, column_no},
{inpf, pushback, line_no, column_no}}
chr ->
case chr do
"\n" ->
{{chr, line_no, column_no},
{inpf, pushback, line_no + 1, 1}}
_ ->
{{chr, line_no, column_no},
{inpf, pushback, line_no, column_no + 1}}
end
end
end
end
 
def push_back ch, {inpf, pushback, line_no, column_no} do
{inpf, [ch | pushback], line_no, column_no}
end
 
###-------------------------------------------------------------------
###
### Lexical and usage errors.
###
 
def unterminated_comment line_no, column_no do
raise "#{scriptname()}: unterminated comment at #{line_no}:#{column_no}"
end
 
def invalid_integer_literal line_no, column_no, word do
raise "#{scriptname()}: invalid integer literal #{word} at #{line_no}:#{column_no}"
end
 
def unsupported_escape line_no, column_no, chr do
raise "#{scriptname()}: unsupported escape \\#{chr} at #{line_no}:#{column_no}"
end
 
def eoi_in_string_literal line_no, column_no do
raise "#{scriptname()}: end of input in string literal starting at #{line_no}:#{column_no}"
end
 
def eoln_in_string_literal line_no, column_no do
raise "#{scriptname()}: end of line in string literal starting at #{line_no}:#{column_no}"
end
 
def multicharacter_literal line_no, column_no do
raise "#{scriptname()}: unsupported multicharacter literal at #{line_no}:#{column_no}"
end
 
def unterminated_character_literal line_no, column_no do
raise "#{scriptname()}: unterminated character literal starting at #{line_no}:#{column_no}"
end
 
def unexpected_character line_no, column_no, chr do
raise "#{scriptname()}: unexpected character '#{chr}' at #{line_no}:#{column_no}"
end
 
def usage_error() do
IO.puts "Usage: #{scriptname()} [INPUTFILE [OUTPUTFILE]]"
IO.puts "If either of INPUTFILE or OUTPUTFILE is not present or is \"-\","
IO.puts "standard input or standard output is used, respectively."
exit_status = 2
exit_status
end
 
def scriptname() do
Path.basename(__ENV__.file)
end
 
#---------------------------------------------------------------------
 
end ## module Lex
 
Lex.main(System.argv)</syntaxhighlight>
 
{{out}}
<pre>$ ./lex testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
 
 
=={{header|Emacs Lisp}}==
{{works with|Emacs|GNU 27.2}}
{{trans|ATS}}
 
 
<syntaxhighlight lang="lisp">#!/usr/bin/emacs --script
;;
;; The Rosetta Code lexical analyzer in GNU Emacs Lisp.
;;
;; Migrated from the ATS. However, Emacs Lisp is not friendly to the
;; functional style of the ATS implementation; therefore the
;; differences are vast.
;;
;; (A Scheme migration could easily, on the other hand, have been
;; almost exact. It is interesting to contrast Lisp dialects and see
;; how huge the differences are.)
;;
;; The script currently takes input only from standard input and
;; writes the token stream only to standard output.
;;
 
(require 'cl-lib)
 
;;; The type of a character, consisting of its code point and where it
;;; occurred in the text.
(cl-defstruct (ch_t (:constructor make-ch (ichar line-no column-no)))
ichar line-no column-no)
 
(defun ch-ichar (ch)
(ch_t-ichar ch))
 
(defun ch-line-no (ch)
(ch_t-line-no ch))
 
(defun ch-column-no (ch)
(ch_t-column-no ch))
 
;;; The type of an "inputter", consisting of an open file for the
;;; text, a pushback buffer (which is an indefinitely deep stack of
;;; ch_t), an input buffer for the current line, and a position in the
;;; text.
(cl-defstruct inp_t file pushback line line-no column-no)
 
(defun make-inp (file)
"Initialize a new inp_t."
(make-inp_t :file file
:pushback '()
:line ""
:line-no 0
:column-no 0))
 
(defvar inp (make-inp t)
"A global inp_t.")
 
(defun get-ch ()
"Get a ch_t, either from the pushback buffer or from the input."
(pcase (inp_t-pushback inp)
(`(,ch . ,tail)
;; Emacs Lisp has only single value return, so the results come
;; back as a list rather than multiple values.
(setq inp (make-inp_t :file (inp_t-file inp)
:pushback tail
:line (inp_t-line inp)
:line-no (inp_t-line-no inp)
:column-no (inp_t-column-no inp)))
ch)
('()
(let ((line (inp_t-line inp))
(line-no (inp_t-line-no inp))
(column-no (inp_t-column-no inp)))
(when (string= line "")
;; Refill the buffer.
(let ((text
(condition-case nil (read-string "")
nil (error 'eoi))))
(if (eq text 'eoi)
(setq line 'eoi)
(setq line (format "%s%c" text ?\n)))
(setq line-no (1+ line-no))
(setq column-no 1)))
(if (eq line 'eoi)
(progn
(setq inp (make-inp_t :file (inp_t-file inp)
:pushback (inp_t-pushback inp)
:line line
:line-no line-no
:column-no column-no))
(make-ch 'eoi line-no column-no))
(let ((c (elt line 0))
(line (substring line 1)))
(setq inp (make-inp_t :file (inp_t-file inp)
:pushback (inp_t-pushback inp)
:line line
:line-no line-no
:column-no (1+ column-no)))
(make-ch c line-no column-no)))))))
 
(defun get-new-line (file)
;; Currently "file" is ignored and the input must be from stdin.
(read-from-minibuffer "" :default 'eoi))
 
(defun push-back (ch)
"Push back a ch_t."
(setq inp (make-inp_t :file (inp_t-file inp)
:pushback (cons ch (inp_t-pushback inp))
:line (inp_t-line inp)
:line-no (inp_t-line-no inp)
:column-no (inp_t-column-no inp))))
 
(defun get-position ()
"Return the line-no and column-no of the next ch_t to be
returned by get-ch, assuming there are no more pushbacks
beforehand."
(let* ((ch (get-ch))
(line-no (ch-line-no ch))
(column-no (ch-column-no ch)))
(push-back ch)
(list line-no column-no)))
 
(defun scan-text (outf)
"The main loop."
(cl-loop for toktup = (get-next-token)
do (print-token outf toktup)
until (string= (elt toktup 0) "End_of_input")))
 
(defun print-token (outf toktup)
"Print a token, along with its position and possibly an
argument."
;; Currently outf is ignored, and the output goes to stdout.
(pcase toktup
(`(,tok ,arg ,line-no ,column-no)
(princ (format "%5d %5d %s" line-no column-no tok))
(pcase tok
("Identifier" (princ (format " %s\n" arg)))
("Integer" (princ (format " %s\n" arg)))
("String" (princ (format " %s\n" arg)))
(_ (princ "\n"))))))
 
(defun get-next-token ()
"The token dispatcher. Returns the next token, as a list along
with its argument and text position."
(skip-spaces-and-comments)
(let* ((ch (get-ch))
(ln (ch-line-no ch))
(cn (ch-column-no ch)))
(pcase (ch-ichar ch)
('eoi (list "End_of_input" "" ln cn))
(?, (list "Comma" "," ln cn))
(?\N{SEMICOLON} (list "Semicolon" ";" ln cn))
(?\N{LEFT PARENTHESIS} (list "LeftParen" "(" ln cn))
(?\N{RIGHT PARENTHESIS} (list "RightParen" ")" ln cn))
(?{ (list "LeftBrace" "{" ln cn))
(?} (list "RightBrace" "}" ln cn))
(?* (list "Op_multiply" "*" ln cn))
(?/ (list "Op_divide" "/" ln cn))
(?% (list "Op_mod" "%" ln cn))
(?+ (list "Op_add" "+" ln cn))
(?- (list "Op_subtract" "-" ln cn))
(?< (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?= (list "Op_lessequal" "<=" ln cn))
(_ (push-back ch1)
(list "Op_less" "<" ln cn)))))
(?> (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?= (list "Op_greaterequal" ">=" ln cn))
(_ (push-back ch1)
(list "Op_greater" ">" ln cn)))))
(?= (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?= (list "Op_equal" "==" ln cn))
(_ (push-back ch1)
(list "Op_assign" "=" ln cn)))))
(?! (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?= (list "Op_notequal" "!=" ln cn))
(_ (push-back ch1)
(list "Op_not" "!" ln cn)))))
(?& (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?& (list "Op_and" "&&" ln cn))
(_ (unexpected-character ln cn (get-ichar ch))))))
(?| (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?| (list "Op_or" "||" ln cn))
(_ (unexpected-character ln cn (get-ichar ch))))))
(?\N{QUOTATION MARK} (push-back ch) (scan-string-literal))
(?\N{APOSTROPHE} (push-back ch) (scan-character-literal))
((pred digitp) (push-back ch) (scan-integer-literal))
((pred identifier-start-p)
(progn
(push-back ch)
(scan-identifier-or-reserved-word)))
(c (unexpected-character ln cn c)))))
 
(defun skip-spaces-and-comments ()
"Skip spaces and comments. A comment is treated as equivalent
to a run of spaces."
(cl-loop for ch = (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?/ (let* ((ch2 (get-ch))
(line-no (ch-line-no ch1))
(column-no (ch-column-no ch1))
(position `(,line-no ,column-no)))
(pcase (ch-ichar ch2)
(?* (scan-comment position)
(get-ch))
(_ (push-back ch2)
ch1))))
(_ ch1)))
while (spacep (ch-ichar ch))
finally do (push-back ch)))
 
(defun scan-comment (position)
(cl-loop for ch = (get-ch)
for done = (comment-done-p ch position)
until done))
 
(defun comment-done-p (ch position)
(pcase (ch-ichar ch)
('eoi (apply 'unterminated-comment position))
(?* (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
('eoi (apply 'unterminated-comment position))
(?/ t)
(_ nil))))
(_ nil)))
 
(defun scan-integer-literal ()
"Scan an integer literal, on the assumption that a digit has
been seen and pushed back."
(let* ((position (get-position))
(lst (scan-word))
(s (list-to-string lst)))
(if (all-digits-p lst)
`("Integer" ,s . ,position)
(apply 'illegal-integer-literal `(,@position , s)))))
 
(defun scan-identifier-or-reserved-word ()
"Scan an identifier or reserved word, on the assumption that a
legal first character (for an identifier) has been seen and
pushed back."
(let* ((position (get-position))
(lst (scan-word))
(s (list-to-string lst))
(tok (pcase s
("else" "Keyword_else")
("if" "Keyword_if")
("while" "Keyword_while")
("print" "Keyword_print")
("putc" "Keyword_putc")
(_ "Identifier"))))
`(,tok ,s . ,position)))
 
(defun scan-word ()
(cl-loop for ch = (get-ch)
while (identifier-continuation-p (ch-ichar ch))
collect (ch-ichar ch)
finally do (push-back ch)))
 
(defun scan-string-literal ()
"Scan a string literal, on the assumption that a double quote
has been seen and pushed back."
(let* ((ch (get-ch))
(_ (cl-assert (= (ch-ichar ch) ?\N{QUOTATION MARK})))
(line-no (ch-line-no ch))
(column-no (ch-column-no ch))
(position `(,line-no ,column-no))
(lst (scan-str-lit position))
(lst `(?\N{QUOTATION MARK} ,@lst ?\N{QUOTATION MARK})))
`("String" ,(list-to-string lst) . ,position)))
 
(defun scan-str-lit (position)
(flatten
(cl-loop for ch = (get-ch)
until (= (ch-ichar ch) ?\N{QUOTATION MARK})
collect (process-str-lit-character
(ch-ichar ch) position))))
 
(defun process-str-lit-character (c position)
;; NOTE: This script might insert a newline before any eoi, so that
;; "end-of-input-in-string-literal" never actually occurs. It is a
;; peculiarity of the script's input mechanism.
(pcase c
('eoi (apply 'end-of-input-in-string-literal position))
(?\n (apply 'end-of-line-in-string-literal position))
(?\\ (let ((ch1 (get-ch)))
(pcase (ch-ichar ch1)
(?n '(?\\ ?n))
(?\\ '(?\\ ?\\))
(c (unsupported-escape (ch-line-no ch1)
(ch-column-no ch1)
c)))))
(c c)))
 
(defun scan-character-literal ()
"Scan a character literal, on the assumption that an ASCII
single quote (that is, a Unicode APOSTROPHE) has been seen and
pushed back."
(let* ((toktup (scan-character-literal-without-checking-end))
(line-no (elt toktup 2))
(column-no (elt toktup 3))
(position (list line-no column-no)))
(check-char-lit-end position)
toktup))
 
(defun check-char-lit-end (position)
(let ((ch (get-ch)))
(unless (and (integerp (ch-ichar ch))
(= (ch-ichar ch) ?\N{APOSTROPHE}))
(push-back ch)
(loop-to-char-lit-end position))))
 
(defun loop-to-char-lit-end (position)
(cl-loop for ch = (get-ch)
until (or (eq (ch-ichar ch) 'eoi)
(= (ch-ichar ch) ?\N{APOSTROPHE}))
finally do (if (eq (ch-ichar ch) 'eoi)
(apply 'unterminated-character-literal
position)
(apply 'multicharacter-literal position))))
 
(defun scan-character-literal-without-checking-end ()
(let* ((ch (get-ch))
(_ (cl-assert (= (ch-ichar ch) ?\N{APOSTROPHE})))
(line-no (ch-line-no ch))
(column-no (ch-column-no ch))
(position (list line-no column-no))
(ch1 (get-ch)))
(pcase (ch-ichar ch1)
('eoi (apply 'unterminated-character-literal position))
(?\\ (let ((ch2 (get-ch)))
(pcase (ch-ichar ch2)
('eoi (apply 'unterminated-character-literal position))
(?n `("Integer" ,(format "%d" ?\n) . ,position))
(?\\ `("Integer" ,(format "%d" ?\\) . ,position))
(c (unsupported-escape (ch-line-no ch1)
(ch-column-no ch1)
c)))))
(c `("Integer" ,(format "%d" c) . ,position)))))
 
(defun spacep (c)
(and (integerp c) (or (= c ?\N{SPACE})
(and (<= 9 c) (<= c 13)))))
 
(defun digitp (c)
(and (integerp c) (<= ?0 c) (<= c ?9)))
 
(defun lowerp (c)
;; Warning: in EBCDIC, this kind of test for "alphabetic" is no
;; good. The letters are not contiguous.
(and (integerp c) (<= ?a c) (<= c ?z)))
 
(defun upperp (c)
;; Warning: in EBCDIC, this kind of test for "alphabetic" is no
;; good. The letters are not contiguous.
(and (integerp c) (<= ?A c) (<= c ?Z)))
 
(defun alphap (c)
(or (lowerp c) (upperp c)))
 
(defun identifier-start-p (c)
(and (integerp c) (or (alphap c) (= c ?_))))
 
(defun identifier-continuation-p (c)
(and (integerp c) (or (alphap c) (= c ?_) (digitp c))))
 
(defun all-digits-p (thing)
(cl-loop for c in thing
if (not (digitp c)) return nil
finally return t))
 
(defun list-to-string (lst)
"Convert a list of characters to a string."
(apply 'string lst))
 
(defun flatten (lst)
"Flatten nested lists. (The implementation is recursive and not
for very long lists.)"
(pcase lst
('() '())
(`(,head . ,tail)
(if (listp head)
(append (flatten head) (flatten tail))
(cons head (flatten tail))))))
 
(defun unexpected-character (line-no column-no c)
(error (format "unexpected character '%c' at %d:%d"
c line-no column-no)))
 
(defun unsupported-escape (line-no column-no c)
(error (format "unsupported escape \\%c at %d:%d"
c line-no column-no)))
 
(defun illegal-integer-literal (line-no column-no s)
(error (format "illegal integer literal \"%s\" at %d:%d"
s line-no column-no)))
 
(defun unterminated-character-literal (line-no column-no)
(error (format "unterminated character literal starting at %d:%d"
line-no column-no)))
 
(defun multicharacter-literal (line-no column-no)
(error (format
"unsupported multicharacter literal starting at %d:%d"
line-no column-no)))
 
(defun end-of-input-in-string-literal (line-no column-no)
(error (format "end of input in string literal starting at %d:%d"
line-no column-no)))
 
(defun end-of-line-in-string-literal (line-no column-no)
(error (format "end of line in string literal starting at %d:%d"
line-no column-no)))
 
(defun unterminated-comment (line-no column-no)
(error (format "unterminated comment starting at %d:%d"
line-no column-no)))
 
(defun main ()
(setq inp (make-inp t))
(scan-text t))
 
(main)</syntaxhighlight>
 
 
{{out}}
<pre>$ ./lex-in-el < compiler-tests/testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|Erlang}}==
{{works with|Erlang|24.3.3}}
{{trans|ATS}}
{{trans|Elixir}}
 
 
<syntaxhighlight lang="erlang">#!/bin/env escript
%%%-------------------------------------------------------------------
 
-record (inp_t, {inpf, pushback, line_no, column_no}).
 
main (Args) ->
main_program (Args).
 
main_program ([]) ->
scan_from_inpf_to_outf ("-", "-"),
halt (0);
main_program ([Inpf_filename]) ->
scan_from_inpf_to_outf (Inpf_filename, "-"),
halt (0);
main_program ([Inpf_filename, Outf_filename]) ->
scan_from_inpf_to_outf (Inpf_filename, Outf_filename),
halt (0);
main_program ([_, _ | _]) ->
ProgName = escript:script_name (),
io:put_chars (standard_error, "Usage: "),
io:put_chars (standard_error, ProgName),
io:put_chars (standard_error, " [INPUTFILE [OUTPUTFILE]]\n"),
halt (1).
 
scan_from_inpf_to_outf ("-", "-") ->
scan_input (standard_io, standard_io);
scan_from_inpf_to_outf (Inpf_filename, "-") ->
case file:open (Inpf_filename, [read]) of
{ok, Inpf} -> scan_input (Inpf, standard_io);
_ -> open_failure (Inpf_filename, "input")
end;
scan_from_inpf_to_outf ("-", Outf_filename) ->
case file:open (Outf_filename, [write]) of
{ok, Outf} -> scan_input (standard_io, Outf);
_ -> open_failure (Outf_filename, "output")
end;
scan_from_inpf_to_outf (Inpf_filename, Outf_filename) ->
case file:open(Inpf_filename, [read]) of
{ok, Inpf} ->
case file:open (Outf_filename, [write]) of
{ok, Outf} -> scan_input (Inpf, Outf);
_ -> open_failure (Outf_filename, "output")
end;
_ -> open_failure (Inpf_filename, "input")
end.
 
open_failure (Filename, ForWhat) ->
ProgName = escript:script_name (),
io:put_chars (standard_error, ProgName),
io:put_chars (standard_error, ": failed to open \""),
io:put_chars (standard_error, Filename),
io:put_chars (standard_error, "\" for "),
io:put_chars (standard_error, ForWhat),
io:put_chars (standard_error, "\n"),
halt (1).
 
scan_input (Inpf, Outf) ->
scan_text (Outf, make_inp (Inpf)).
 
scan_text (Outf, Inp) ->
{TokTup, Inp1} = get_next_token (Inp),
print_token (Outf, TokTup),
case TokTup of
{"End_of_input", _, _, _} -> ok;
_ -> scan_text (Outf, Inp1)
end.
 
print_token (Outf, {Tok, Arg, Line_no, Column_no}) ->
S_line_no = erlang:integer_to_list (Line_no),
S_column_no = erlang:integer_to_list (Column_no),
io:put_chars (Outf, string:pad (S_line_no, 5, leading)),
io:put_chars (Outf, " "),
io:put_chars (Outf, string:pad (S_column_no, 5, leading)),
io:put_chars (Outf, " "),
io:put_chars (Outf, Tok),
{Padding, Arg1} =
case Tok of
"Identifier" -> {" ", Arg};
"Integer" -> {" ", Arg};
"String" -> {" ", Arg};
_ -> {"", ""}
end,
io:put_chars (Outf, Padding),
io:put_chars (Outf, Arg1),
io:put_chars ("\n").
 
%%%-------------------------------------------------------------------
%%%
%%% The token dispatcher.
%%%
 
get_next_token (Inp) ->
Inp00 = skip_spaces_and_comments (Inp),
{Ch, Inp0} = get_ch (Inp00),
{Char, Line_no, Column_no} = Ch,
Ln = Line_no,
Cn = Column_no,
case Char of
eof -> {{"End_of_input", "", Ln, Cn}, Inp0};
"," -> {{"Comma", ",", Ln, Cn}, Inp0};
";" -> {{"Semicolon", ";", Ln, Cn}, Inp0};
"(" -> {{"LeftParen", "(", Ln, Cn}, Inp0};
")" -> {{"RightParen", ")", Ln, Cn}, Inp0};
"{" -> {{"LeftBrace", "{", Ln, Cn}, Inp0};
"}" -> {{"RightBrace", "}", Ln, Cn}, Inp0};
"*" -> {{"Op_multiply", "*", Ln, Cn}, Inp0};
"/" -> {{"Op_divide", "/", Ln, Cn}, Inp0};
"%" -> {{"Op_mod", "%", Ln, Cn}, Inp0};
"+" -> {{"Op_add", "+", Ln, Cn}, Inp0};
"-" -> {{"Op_subtract", "-", Ln, Cn}, Inp0};
"<" ->
{Ch1, Inp1} = get_ch (Inp0),
{Char1, _, _} = Ch1,
case Char1 of
"=" -> {{"Op_lessequal", "<=", Ln, Cn}, Inp1};
_ -> {{"Op_less", "<", Ln, Cn}, push_back (Ch1, Inp1)}
end;
">" ->
{Ch1, Inp1} = get_ch (Inp0),
{Char1, _, _} = Ch1,
case Char1 of
"=" -> {{"Op_greaterequal", ">=", Ln, Cn}, Inp1};
_ -> {{"Op_greater", ">", Ln, Cn}, push_back (Ch1, Inp1)}
end;
"=" ->
{Ch1, Inp1} = get_ch (Inp0),
{Char1, _, _} = Ch1,
case Char1 of
"=" -> {{"Op_equal", "==", Ln, Cn}, Inp1};
_ -> {{"Op_assign", "=", Ln, Cn}, push_back (Ch1, Inp1)}
end;
"!" ->
{Ch1, Inp1} = get_ch (Inp0),
{Char1, _, _} = Ch1,
case Char1 of
"=" -> {{"Op_notequal", "!=", Ln, Cn}, Inp1};
_ -> {{"Op_not", "!", Ln, Cn}, push_back (Ch1, Inp1)}
end;
"&" ->
{Ch1, Inp1} = get_ch (Inp0),
{Char1, _, _} = Ch1,
case Char1 of
"&" -> {{"Op_and", "&&", Ln, Cn}, Inp1};
_ -> unexpected_character (Ln, Cn, Char)
end;
"|" ->
{Ch1, Inp1} = get_ch (Inp0),
{Char1, _, _} = Ch1,
case Char1 of
"|" -> {{"Op_or", "||", Ln, Cn}, Inp1};
_ -> unexpected_character (Ln, Cn, Char)
end;
"\"" ->
Inp1 = push_back (Ch, Inp0),
scan_string_literal (Inp1);
"'" ->
Inp1 = push_back (Ch, Inp0),
scan_character_literal (Inp1);
_ ->
case is_digit (Char) of
true ->
Inp1 = push_back (Ch, Inp0),
scan_integer_literal (Inp1);
false ->
case is_alpha_or_underscore (Char) of
true ->
Inp1 = push_back (Ch, Inp0),
scan_identifier_or_reserved_word (Inp1);
false ->
unexpected_character (Ln, Cn, Char)
end
end
end.
 
%%%-------------------------------------------------------------------
%%%
%%% Skipping past spaces and /* ... */ comments.
%%%
%%% Comments are treated exactly like a bit of whitespace. They never
%%% make it to the dispatcher.
%%%
 
skip_spaces_and_comments (Inp) ->
{Ch, Inp0} = get_ch (Inp),
{Char, Line_no, Column_no} = Ch,
case classify_char (Char) of
eof -> push_back (Ch, Inp0);
space -> skip_spaces_and_comments (Inp0);
slash ->
{Ch1, Inp1} = get_ch (Inp0),
case Ch1 of
{"*", _, _} ->
Inp2 = scan_comment (Inp1, Line_no, Column_no),
skip_spaces_and_comments (Inp2);
_ -> push_back (Ch, (push_back (Ch1, Inp1)))
end;
other -> push_back (Ch, Inp0)
end.
 
classify_char (Char) ->
case Char of
eof -> eof;
"/" -> slash;
_ -> case is_space (Char) of
true -> space;
false -> other
end
end.
 
scan_comment (Inp, Line_no, Column_no) ->
{Ch0, Inp0} = get_ch (Inp),
case Ch0 of
{eof, _, _} -> unterminated_comment (Line_no, Column_no);
{"*", _, _} ->
{Ch1, Inp1} = get_ch (Inp0),
case Ch1 of
{eof, _, _} ->
unterminated_comment (Line_no, Column_no);
{"/", _, _} -> Inp1;
_ -> scan_comment (Inp1, Line_no, Column_no)
end;
_ -> scan_comment (Inp0, Line_no, Column_no)
end.
 
is_space (S) ->
case re:run (S, "^[[:space:]]+$") of
{match, _} -> true;
_ -> false
end.
 
%%%-------------------------------------------------------------------
%%%
%%% Scanning of integer literals, identifiers, and reserved words.
%%%
%%% These three types of token are very similar to each other.
%%%
 
scan_integer_literal (Inp) ->
%% Scan an entire word, not just digits. This way we detect
%% erroneous text such as "23skidoo".
{Line_no, Column_no, Inp1} = get_position (Inp),
{Word, Inp2} = scan_word (Inp1),
case is_digit (Word) of
true -> {{"Integer", Word, Line_no, Column_no}, Inp2};
false -> invalid_integer_literal (Line_no, Column_no, Word)
end.
 
scan_identifier_or_reserved_word (Inp) ->
%% It is assumed that the first character is of the correct type,
%% thanks to the dispatcher.
{Line_no, Column_no, Inp1} = get_position (Inp),
{Word, Inp2} = scan_word (Inp1),
Tok =
case Word of
"if" -> "Keyword_if";
"else" -> "Keyword_else";
"while" -> "Keyword_while";
"print" -> "Keyword_print";
"putc" -> "Keyword_putc";
_ -> "Identifier"
end,
{{Tok, Word, Line_no, Column_no}, Inp2}.
 
scan_word (Inp) ->
scan_word_loop (Inp, "").
 
scan_word_loop (Inp, Word0) ->
{Ch1, Inp1} = get_ch (Inp),
{Char1, _, _} = Ch1,
case is_alnum_or_underscore (Char1) of
true -> scan_word_loop (Inp1, Word0 ++ Char1);
false -> {Word0, push_back (Ch1, Inp1)}
end.
 
get_position (Inp) ->
{Ch1, Inp1} = get_ch (Inp),
{_, Line_no, Column_no} = Ch1,
Inp2 = push_back (Ch1, Inp1),
{Line_no, Column_no, Inp2}.
 
is_digit (S) ->
case re:run (S, "^[[:digit:]]+$") of
{match, _} -> true;
_ -> false
end.
 
is_alpha_or_underscore (S) ->
case re:run (S, "^[[:alpha:]_]+$") of
{match, _} -> true;
_ -> false
end.
 
is_alnum_or_underscore (S) ->
case re:run (S, "^[[:alnum:]_]+$") of
{match, _} -> true;
_ -> false
end.
 
%%%-------------------------------------------------------------------
%%%
%%% Scanning of string literals.
%%%
%%% It is assumed that the first character is the opening quote, and
%%% that the closing quote is the same character.
%%%
 
 
scan_string_literal (Inp) ->
{Ch1, Inp1} = get_ch (Inp),
{Quote_mark, Line_no, Column_no} = Ch1,
{Contents, Inp2} = scan_str_lit (Inp1, Ch1),
Toktup = {"String", Quote_mark ++ Contents ++ Quote_mark,
Line_no, Column_no},
{Toktup, Inp2}.
 
scan_str_lit (Inp, Ch) -> scan_str_lit_loop (Inp, Ch, "").
 
scan_str_lit_loop (Inp, Ch, Contents) ->
{Quote_mark, Line_no, Column_no} = Ch,
{Ch1, Inp1} = get_ch (Inp),
{Char1, Line_no1, Column_no1} = Ch1,
case Char1 of
Quote_mark -> {Contents, Inp1};
eof -> eoi_in_string_literal (Line_no, Column_no);
"\n" -> eoln_in_string_literal (Line_no, Column_no);
"\\" ->
{Ch2, Inp2} = get_ch (Inp1),
{Char2, _, _} = Ch2,
case Char2 of
"n" ->
scan_str_lit_loop (Inp2, Ch, Contents ++ "\\n");
"\\" ->
scan_str_lit_loop (Inp2, Ch, Contents ++ "\\\\");
_ ->
unsupported_escape (Line_no1, Column_no1, Char2)
end;
_ -> scan_str_lit_loop (Inp1, Ch, Contents ++ Char1)
end.
 
%%%-------------------------------------------------------------------
%%%
%%% Scanning of character literals.
%%%
%%% It is assumed that the first character is the opening quote, and
%%% that the closing quote is the same character.
%%%
%%% The tedious part of scanning a character literal is distinguishing
%%% between the kinds of lexical error. (One might wish to modify the
%%% code to detect, as a distinct kind of error, end of line within a
%%% character literal.)
%%%
 
scan_character_literal (Inp) ->
{Ch, Inp0} = get_ch (Inp),
{_, Line_no, Column_no} = Ch,
{Ch1, Inp1} = get_ch (Inp0),
{Char1, Line_no1, Column_no1} = Ch1,
{Intval, Inp3} =
case Char1 of
eof -> unterminated_character_literal (Line_no, Column_no);
"\\" ->
{Ch2, Inp2} = get_ch (Inp1),
{Char2, _, _} = Ch2,
case Char2 of
eof -> unterminated_character_literal (Line_no,
Column_no);
"n" -> {char_to_code ("\n"), Inp2};
"\\" -> {char_to_code ("\\"), Inp2};
_ -> unsupported_escape (Line_no1, Column_no1,
Char2)
end;
_ -> {char_to_code (Char1), Inp1}
end,
Inp4 = check_character_literal_end (Inp3, Ch),
{{"Integer", Intval, Line_no, Column_no}, Inp4}.
 
char_to_code (Char) ->
%% Hat tip to https://archive.ph/BxZRS
lists:flatmap (fun erlang:integer_to_list/1, Char).
 
check_character_literal_end (Inp, Ch) ->
{Char, _, _} = Ch,
{{Char1, _, _}, Inp1} = get_ch (Inp),
case Char1 of
Char -> Inp1;
_ -> find_char_lit_end (Inp1, Ch) % Handle a lexical error.
end.
 
find_char_lit_end (Inp, Ch) ->
%% There is a lexical error. Determine which kind it fits into.
{Char, Line_no, Column_no} = Ch,
{{Char1, _, _}, Inp1} = get_ch (Inp),
case Char1 of
Char -> multicharacter_literal (Line_no, Column_no);
eof -> unterminated_character_literal (Line_no, Column_no);
_ -> find_char_lit_end (Inp1, Ch)
end.
 
%%%-------------------------------------------------------------------
%%%
%%% Character-at-a-time input, with unrestricted pushback, and with
%%% line and column numbering.
%%%
 
make_inp (Inpf) ->
#inp_t{inpf = Inpf,
pushback = [],
line_no = 1,
column_no = 1}.
 
get_ch (Inp) ->
#inp_t{inpf = Inpf,
pushback = Pushback,
line_no = Line_no,
column_no = Column_no} = Inp,
case Pushback of
[Ch | Tail] ->
Inp1 = Inp#inp_t{pushback = Tail},
{Ch, Inp1};
[] ->
case io:get_chars (Inpf, "", 1) of
eof ->
Ch = {eof, Line_no, Column_no},
{Ch, Inp};
{error, _} ->
Ch = {eof, Line_no, Column_no},
{Ch, Inp};
Char ->
case Char of
"\n" ->
Ch = {Char, Line_no, Column_no},
Inp1 = Inp#inp_t{line_no = Line_no + 1,
column_no = 1},
{Ch, Inp1};
_ ->
Ch = {Char, Line_no, Column_no},
Inp1 =
Inp#inp_t{column_no = Column_no + 1},
{Ch, Inp1}
end
end
end.
 
push_back (Ch, Inp) ->
Inp#inp_t{pushback = [Ch | Inp#inp_t.pushback]}.
 
%%%-------------------------------------------------------------------
 
invalid_integer_literal (Line_no, Column_no, Word) ->
error_abort ("invalid integer literal \"" ++
Word ++ "\" at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
unsupported_escape (Line_no, Column_no, Char) ->
error_abort ("unsupported escape \\" ++
Char ++ " at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
unexpected_character (Line_no, Column_no, Char) ->
error_abort ("unexpected character '" ++
Char ++ "' at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
eoi_in_string_literal (Line_no, Column_no) ->
error_abort ("end of input in string literal starting at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
eoln_in_string_literal (Line_no, Column_no) ->
error_abort ("end of line in string literal starting at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
unterminated_character_literal (Line_no, Column_no) ->
error_abort ("unterminated character literal starting at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
multicharacter_literal (Line_no, Column_no) ->
error_abort ("unsupported multicharacter literal starting at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
unterminated_comment (Line_no, Column_no) ->
error_abort ("unterminated comment starting at " ++
integer_to_list (Line_no) ++ ":" ++
integer_to_list (Column_no)).
 
error_abort (Message) ->
ProgName = escript:script_name (),
io:put_chars (standard_error, ProgName),
io:put_chars (standard_error, ": "),
io:put_chars (standard_error, Message),
io:put_chars (standard_error, "\n"),
halt (1).
 
%%%-------------------------------------------------------------------
%%% Instructions to GNU Emacs --
%%% local variables:
%%% mode: erlang
%%% erlang-indent-level: 3
%%% end:
%%%-------------------------------------------------------------------</syntaxhighlight>
 
 
{{out}}
<pre>$ ./lex-in-Erlang compiler-tests/testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|Euphoria}}==
Tested with Euphoria 4.05.
<langsyntaxhighlight lang="euphoria">include std/io.e
include std/map.e
include std/types.e
Line 4,213 ⟶ 5,877:
end procedure
 
main(command_line())</langsyntaxhighlight>
 
{{out|case=test case 3}}
Line 4,257 ⟶ 5,921:
=={{header|Flex}}==
Tested with Flex 2.5.4.
<syntaxhighlight lang="c">%{
<lang C>%{
#include <stdio.h>
#include <stdlib.h>
Line 4,430 ⟶ 6,094:
} while (tok != tk_EOI);
return 0;
}</langsyntaxhighlight>
 
{{out|case=test case 3}}
Line 4,474 ⟶ 6,138:
=={{header|Forth}}==
Tested with Gforth 0.7.3.
<langsyntaxhighlight Forthlang="forth">CREATE BUF 0 , \ single-character look-ahead buffer
CREATE COLUMN# 0 ,
CREATE LINE# 1 ,
Line 4,596 ⟶ 6,260:
THEN THEN ;
: TOKENIZE BEGIN CONSUME AGAIN ;
TOKENIZE</langsyntaxhighlight>
 
{{out}}
Line 4,610 ⟶ 6,274:
 
The author has placed this Fortran code in the public domain.
<syntaxhighlight lang="fortran">!!!
<lang Fortran>!!!
!!! An implementation of the Rosetta Code lexical analyzer task:
!!! https://rosettacode.org/wiki/Compiler/lexical_analyzer
Line 5,688 ⟶ 7,352:
end subroutine print_usage
end program lex</langsyntaxhighlight>
 
{{out}}
Line 5,729 ⟶ 7,393:
=={{header|FreeBASIC}}==
Tested with FreeBASIC 1.05
<langsyntaxhighlight FreeBASIClang="freebasic">enum Token_type
tk_EOI
tk_Mul
Line 6,015 ⟶ 7,679:
print : print "Hit any to end program"
sleep
system</langsyntaxhighlight>
{{out|case=test case 3}}
<b>
Line 6,056 ⟶ 7,720:
=={{header|Go}}==
{{trans|FreeBASIC}}
<langsyntaxhighlight lang="go">package main
 
import (
Line 6,433 ⟶ 8,097:
initLex()
process()
}</langsyntaxhighlight>
 
{{out}}
Line 6,476 ⟶ 8,140:
=={{header|Haskell}}==
Tested with GHC 8.0.2
<langsyntaxhighlight lang="haskell">import Control.Applicative hiding (many, some)
import Control.Monad.State.Lazy
import Control.Monad.Trans.Maybe (MaybeT, runMaybeT)
Line 6,780 ⟶ 8,444:
where (Just t, s') = runState (runMaybeT lexer) s
(txt, _, _) = s'
</syntaxhighlight>
</lang>
 
{{out|case=test case 3}}
Line 6,832 ⟶ 8,496:
Global variables are avoided except for some constants that require initialization.
 
<syntaxhighlight lang="icon">#
<lang Icon>#
# The Rosetta Code lexical analyzer in Icon with co-expressions. Based
# upon the ATS implementation.
Line 6,891 ⟶ 8,555:
inpf := &input
outf := &output
if 1 <= *args & args[1] ~== "-" then inpf := open(args[1], "rt"){
if 2 <= *args & args[2] ~== "-" then outfinpf := open(args[21], "wtrt") |
stop("cannot open ", args[1], " for input")
}
if 2 <= *args & args[2] ~== "-" then {
outf := open(args[2], "wt") |
stop("cannot open ", args[2], " for output")
}
 
pushback_buffer := []
Line 7,061 ⟶ 8,731:
toktup := scan_character_literal_without_checking_end(inp)
ch1 := @inp
while EOF ~=== ch1[1] &if ch1[1] ~=== close_quote dothen {
case ch1[1] ofrepeat {
case ch1[1] of {
EOF : unterminated_character_literal(line_no, column_no)
close_quote EOF : multicharacter_literalunterminated_character_literal(line_no, column_no)
close_quote : multicharacter_literal(line_no, column_no)
default : ch1 := @inp
default : ch1 := @inp
}
}
}
Line 7,151 ⟶ 8,823:
if ch[1] == "*" then {
ch1 := @inp
(EOF === chch1[1]) & unterminated_comment(line_no, column_no)
}
}
Line 7,322 ⟶ 8,994:
procedure max(x, y)
return (if x < y then y else x)
end</langsyntaxhighlight>
 
 
Line 7,371 ⟶ 9,043:
Implementation:
 
<langsyntaxhighlight Jlang="j">symbols=:256#0
ch=: {{1 0+x[symbols=: x (a.i.y)} symbols}}
'T0 token' =: 0 ch '%+-!(){};,<>=!|&'
Line 7,491 ⟶ 9,163:
keep=. (tokens~:<,'''')*-.comments+.whitespace+.unknown*a:=values
keep&#each ((1+lines),.columns);<names,.values
}}</langsyntaxhighlight>
 
Test case 3:
 
<syntaxhighlight lang="j">
<lang J>
flex=: {{
'A B'=.y
Line 7,561 ⟶ 9,233:
21 28 Integer 92
22 27 Integer 32
23 1 End_of_input </langsyntaxhighlight>
 
Here, it seems expedient to retain a structured representation of the lexical result. As shown, it's straightforward to produce a "pure" textual result for a hypothetical alternative implementation of the syntax analyzer, but the structured representation will be easier to deal with.
 
=={{header|Java}}==
<langsyntaxhighlight lang="java">
// Translated from python source
 
Line 7,716 ⟶ 9,388:
if (text.equals("")) {
error(line, pos, String.format("identifer_or_integer unrecopgnizedunrecognized character: (%d) %c", (int)this.chr, this.chr));
}
if (Character.isDigit(text.charAt(0))) {
if (!is_number) {
error(line, pos, String.format("invaslidinvalid number: %s", text));
}
return new Token(TokenType.Integer, text, line, pos);
Line 7,807 ⟶ 9,479:
}
}
</syntaxhighlight>
</lang>
 
=={{header|JavaScript}}==
{{incorrect|Javascript|Please show output. Code is identical to [[Compiler/syntax_analyzer]] task}}
<langsyntaxhighlight lang="javascript">
/*
Token: type, value, line, pos
Line 8,024 ⟶ 9,696:
l.printTokens()
})
</syntaxhighlight>
</lang>
 
=={{header|Julia}}==
<langsyntaxhighlight lang="julia">struct Tokenized
startline::Int
startcol::Int
Line 8,182 ⟶ 9,854:
println(lpad(tok.startline, 3), lpad(tok.startcol, 5), lpad(tok.name, 18), " ", tok.value != nothing ? tok.value : "")
end
</langsyntaxhighlight>{{output}}<pre>
Line Col Name Value
5 16 Keyword_print
Line 8,219 ⟶ 9,891:
23 1 End_of_input
</pre>
=={{header|kotlin}}==
{{trans|Java}}
<syntaxhighlight lang="kotlin">// Input: command line argument of file to process or console input. A two or
// three character console input of digits followed by a new line will be
// checked for an integer between zero and twenty-five to select a fixed test
// case to run. Any other console input will be parsed.
 
// Code based on the Java version found here:
// https://rosettacode.org/mw/index.php?title=Compiler/lexical_analyzer&action=edit&section=22
 
// Class to halt the parsing with an exception.
class ParsingFailed(message: String): Exception(message)
 
// Enumerate class of tokens supported by this scanner.
enum class TokenType {
Tk_End_of_input, Op_multiply, Op_divide, Op_mod, Op_add, Op_subtract,
Op_negate, Op_not, Op_less, Op_lessequal, Op_greater, Op_greaterequal,
Op_equal, Op_notequal, Op_assign, Op_and, Op_or, Kw_if,
Kw_else, Kw_while, Kw_print, Kw_putc, Sy_LeftParen, Sy_RightParen,
Sy_LeftBrace, Sy_RightBrace, Sy_Semicolon, Sy_Comma, Tk_Identifier,
Tk_Integer, Tk_String;
 
override fun toString() =
listOf("End_of_input", "Op_multiply", "Op_divide", "Op_mod", "Op_add",
"Op_subtract", "Op_negate", "Op_not", "Op_less", "Op_lessequal",
"Op_greater", "Op_greaterequal", "Op_equal", "Op_notequal",
"Op_assign", "Op_and", "Op_or", "Keyword_if", "Keyword_else",
"Keyword_while", "Keyword_print", "Keyword_putc", "LeftParen",
"RightParen", "LeftBrace", "RightBrace", "Semicolon", "Comma",
"Identifier", "Integer", "String")[this.ordinal]
} // TokenType
 
// Data class of tokens returned by the scanner.
data class Token(val token: TokenType, val value: String, val line: Int,
val pos: Int) {
 
// Overridden method to display the token.
override fun toString() =
"%5d %5d %-15s %s".format(line, pos, this.token,
when (this.token) {
TokenType.Tk_Integer, TokenType.Tk_Identifier ->
" %s".format(this.value)
TokenType.Tk_String ->
this.value.toList().joinToString("", " \"", "\"") {
when (it) {
'\t' ->
"\\t"
'\n' ->
"\\n"
'\u000b' ->
"\\v"
'\u000c' ->
"\\f"
'\r' ->
"\\r"
'"' ->
"\\\""
'\\' ->
"\\"
in ' '..'~' ->
"$it"
else ->
"\\u%04x".format(it.code) } }
else ->
"" } )
} // Token
 
// Function to display an error message and halt the scanner.
fun error(line: Int, pos: Int, msg: String): Nothing =
throw ParsingFailed("(%d, %d) %s\n".format(line, pos, msg))
 
// Class to process the source into tokens with properties of the
// source string, the line number, the column position, the index
// within the source string, the current character being processed,
// and map of the keyword strings to the corresponding token type.
class Lexer(private val s: String) {
private var line = 1
private var pos = 1
private var position = 0
private var chr =
if (s.isEmpty())
' '
else
s[0]
private val keywords = mapOf<String, TokenType>(
"if" to TokenType.Kw_if,
"else" to TokenType.Kw_else,
"print" to TokenType.Kw_print,
"putc" to TokenType.Kw_putc,
"while" to TokenType.Kw_while)
 
// Method to retrive the next character from the source. Use null after
// the end of our source.
private fun getNextChar() =
if (++this.position >= this.s.length) {
this.pos++
this.chr = '\u0000'
this.chr
} else {
this.pos++
this.chr = this.s[this.position]
when (this.chr) {
'\n' -> {
this.line++
this.pos = 0
} // line
'\t' ->
while (this.pos%8 != 1)
this.pos++
} // when
this.chr
} // if
 
// Method to return the division token, skip the comment, or handle the
// error.
private fun div_or_comment(line: Int, pos: Int): Token =
if (getNextChar() != '*')
Token(TokenType.Op_divide, "", line, pos);
else {
getNextChar() // Skip comment start
outer@
while (true)
when (this.chr) {
'\u0000' ->
error(line, pos, "Lexer: EOF in comment");
'*' ->
if (getNextChar() == '/') {
getNextChar() // Skip comment end
break@outer
} // if
else ->
getNextChar()
} // when
getToken()
} // if
 
// Method to verify a character literal. Return the token or handle the
// error.
private fun char_lit(line: Int, pos: Int): Token {
var c = getNextChar() // skip opening quote
when (c) {
'\'' ->
error(line, pos, "Lexer: Empty character constant");
'\\' ->
c = when (getNextChar()) {
'n' ->
10.toChar()
'\\' ->
'\\'
'\'' ->
'\''
else ->
error(line, pos, "Lexer: Unknown escape sequence '\\%c'".
format(this.chr)) }
} // when
if (getNextChar() != '\'')
error(line, pos, "Lexer: Multi-character constant")
getNextChar() // Skip closing quote
return Token(TokenType.Tk_Integer, c.code.toString(), line, pos)
} // char_lit
 
// Method to check next character to see whether it belongs to the token
// we might be in the middle of. Return the correct token or handle the
// error.
private fun follow(expect: Char, ifyes: TokenType, ifno: TokenType,
line: Int, pos: Int): Token =
when {
getNextChar() == expect -> {
getNextChar()
Token(ifyes, "", line, pos)
} // matches
ifno == TokenType.Tk_End_of_input ->
error(line, pos,
"Lexer: %c expected: (%d) '%c'".format(expect,
this.chr.code, this.chr))
else ->
Token(ifno, "", line, pos)
} // when
 
// Method to verify a character string. Return the token or handle the
// error.
private fun string_lit(start: Char, line: Int, pos: Int): Token {
var result = ""
while (getNextChar() != start)
when (this.chr) {
'\u0000' ->
error(line, pos, "Lexer: EOF while scanning string literal")
'\n' ->
error(line, pos, "Lexer: EOL while scanning string literal")
'\\' ->
when (getNextChar()) {
'\\' ->
result += '\\'
'n' ->
result += '\n'
'"' ->
result += '"'
else ->
error(line, pos, "Lexer: Escape sequence unknown '\\%c'".
format(this.chr))
} // when
else ->
result += this.chr
} // when
getNextChar() // Toss closing quote
return Token(TokenType.Tk_String, result, line, pos)
} // string_lit
 
// Method to retrive an identifier or integer. Return the keyword
// token, if the string matches one. Return the integer token,
// if the string is all digits. Return the identifer token, if the
// string is valid. Otherwise, handle the error.
private fun identifier_or_integer(line: Int, pos: Int): Token {
var is_number = true
var text = ""
while (this.chr in listOf('_')+('0'..'9')+('a'..'z')+('A'..'Z')) {
text += this.chr
is_number = is_number && this.chr in '0'..'9'
getNextChar()
} // while
if (text.isEmpty())
error(line, pos, "Lexer: Unrecognized character: (%d) %c".
format(this.chr.code, this.chr))
return when {
text[0] in '0'..'9' ->
if (!is_number)
error(line, pos, "Lexer: Invalid number: %s".
format(text))
else {
val max = Int.MAX_VALUE.toString()
if (text.length > max.length || (text.length == max.length &&
max < text))
error(line, pos,
"Lexer: Number exceeds maximum value %s".
format(text))
Token(TokenType.Tk_Integer, text, line, pos)
} // if
this.keywords.containsKey(text) ->
Token(this.keywords[text]!!, "", line, pos)
else ->
Token(TokenType.Tk_Identifier, text, line, pos) }
} // identifier_or_integer
 
// Method to skip whitespace both C's and Unicode ones and retrive the next
// token.
private fun getToken(): Token {
while (this.chr in listOf('\t', '\n', '\u000b', '\u000c', '\r', ' ') ||
this.chr.isWhitespace())
getNextChar()
val line = this.line
val pos = this.pos
return when (this.chr) {
'\u0000' ->
Token(TokenType.Tk_End_of_input, "", line, pos)
'/' ->
div_or_comment(line, pos)
'\'' ->
char_lit(line, pos)
'<' ->
follow('=', TokenType.Op_lessequal, TokenType.Op_less, line, pos)
'>' ->
follow('=', TokenType.Op_greaterequal, TokenType.Op_greater, line, pos)
'=' ->
follow('=', TokenType.Op_equal, TokenType.Op_assign, line, pos)
'!' ->
follow('=', TokenType.Op_notequal, TokenType.Op_not, line, pos)
'&' ->
follow('&', TokenType.Op_and, TokenType.Tk_End_of_input, line, pos)
'|' ->
follow('|', TokenType.Op_or, TokenType.Tk_End_of_input, line, pos)
'"' ->
string_lit(this.chr, line, pos)
'{' -> {
getNextChar()
Token(TokenType.Sy_LeftBrace, "", line, pos)
} // open brace
'}' -> {
getNextChar()
Token(TokenType.Sy_RightBrace, "", line, pos)
} // close brace
'(' -> {
getNextChar()
Token(TokenType.Sy_LeftParen, "", line, pos)
} // open paren
')' -> {
getNextChar()
Token(TokenType.Sy_RightParen, "", line, pos)
} // close paren
'+' -> {
getNextChar()
Token(TokenType.Op_add, "", line, pos)
} // plus
'-' -> {
getNextChar()
Token(TokenType.Op_subtract, "", line, pos)
} // dash
'*' -> {
getNextChar()
Token(TokenType.Op_multiply, "", line, pos)
} // asterisk
'%' -> {
getNextChar()
Token(TokenType.Op_mod, "", line, pos)
} // percent
';' -> {
getNextChar()
Token(TokenType.Sy_Semicolon, "", line, pos)
} // semicolon
',' -> {
getNextChar()
Token(TokenType.Sy_Comma, "", line, pos)
} // comma
else ->
identifier_or_integer(line, pos) }
} // getToken
 
// Method to parse and display tokens.
fun printTokens() {
do {
val t: Token = getToken()
println(t)
} while (t.token != TokenType.Tk_End_of_input)
} // printTokens
} // Lexer
 
 
// Function to test all good tests from the website and produce all of the
// error messages this program supports.
fun tests(number: Int) {
 
// Function to generate test case 0 source: Hello World/Text.
fun hello() {
Lexer(
"""/*
Hello world
*/
print("Hello, World!\n");
""").printTokens()
} // hello
 
// Function to generate test case 1 source: Phoenix Number.
fun phoenix() {
Lexer(
"""/*
Show Ident and Integers
*/
phoenix_number = 142857;
print(phoenix_number, "\n");""").printTokens()
} // phoenix
 
// Function to generate test case 2 source: All Symbols.
fun symbols() {
Lexer(
"""/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
*/
/* Print */ print /* Sub */ -
/* Putc */ putc /* Lss */ <
/* If */ if /* Gtr */ >
/* Else */ else /* Leq */ <=
/* While */ while /* Geq */ >=
/* Lbrace */ { /* Eq */ ==
/* Rbrace */ } /* Neq */ !=
/* Lparen */ ( /* And */ &&
/* Rparen */ ) /* Or */ ||
/* Uminus */ - /* Semi */ ;
/* Not */ ! /* Comma */ ,
/* Mul */ * /* Assign */ =
/* Div */ / /* Integer */ 42
/* Mod */ % /* String */ "String literal"
/* Add */ + /* Ident */ variable_name
/* character literal */ '\n'
/* character literal */ '\\'
/* character literal */ ' '""").printTokens()
} // symbols
 
// Function to generate test case 3 source: Test Case 4.
fun four() {
Lexer(
"""/*** test printing, embedded \n and comments with lots of '*' ***/
print(42);
print("\nHello World\nGood Bye\nok\n");
print("Print a slash n - \\n.\n");""").printTokens()
} // four
 
// Function to generate test case 4 source: Count.
fun count() {
Lexer(
"""count = 1;
while (count < 10) {
print("count is: ", count, "\n");
count = count + 1;
}""").printTokens()
} // count
 
// Function to generate test case 5 source: 100 Doors.
fun doors() {
Lexer(
"""/* 100 Doors */
i = 1;
while (i * i <= 100) {
print("door ", i * i, " is open\n");
i = i + 1;
}""").printTokens()
} // doors
 
// Function to generate test case 6 source: Negative Tests.
fun negative() {
Lexer(
"""a = (-1 * ((-1 * (5 * 15)) / 10));
print(a, "\n");
b = -a;
print(b, "\n");
print(-b, "\n");
print(-(1), "\n");""").printTokens()
} // negative
 
// Function to generate test case 7 source: Deep.
fun deep() {
Lexer(
"""print(---------------------------------+++5, "\n");
print(((((((((3 + 2) * ((((((2))))))))))))), "\n");
 
if (1) { if (1) { if (1) { if (1) { if (1) { print(15, "\n"); } } } } }""").printTokens()
} // deep
 
// Function to generate test case 8 source: Greatest Common Divisor.
fun gcd() {
Lexer(
"""/* Compute the gcd of 1071, 1029: 21 */
 
a = 1071;
b = 1029;
 
while (b != 0) {
new_a = b;
b = a % b;
a = new_a;
}
print(a);""").printTokens()
} // gcd
 
// Function to generate test case 9 source: Factorial.
fun factorial() {
Lexer(
"""/* 12 factorial is 479001600 */
 
n = 12;
result = 1;
i = 1;
while (i <= n) {
result = result * i;
i = i + 1;
}
print(result);""").printTokens()
} // factorial
 
// Function to generate test case 10 source: Fibonacci Sequence.
fun fibonacci() {
Lexer(
"""/* fibonacci of 44 is 701408733 */
 
n = 44;
i = 1;
a = 0;
b = 1;
while (i < n) {
w = a + b;
a = b;
b = w;
i = i + 1;
}
print(w, "\n");""").printTokens()
} // fibonacci
 
// Function to generate test case 11 source: FizzBuzz.
fun fizzbuzz() {
Lexer(
"""/* FizzBuzz */
i = 1;
while (i <= 100) {
if (!(i % 15))
print("FizzBuzz");
else if (!(i % 3))
print("Fizz");
else if (!(i % 5))
print("Buzz");
else
print(i);
 
print("\n");
i = i + 1;
}""").printTokens()
} // fizzbuzz
 
// Function to generate test case 12 source: 99 Bottles of Beer.
fun bottles() {
Lexer(
"""/* 99 bottles */
bottles = 99;
while (bottles > 0) {
print(bottles, " bottles of beer on the wall\n");
print(bottles, " bottles of beer\n");
print("Take one down, pass it around\n");
bottles = bottles - 1;
print(bottles, " bottles of beer on the wall\n\n");
}""").printTokens()
} // bottles
 
// Function to generate test case 13 source: Primes.
fun primes() {
Lexer(
"""/*
Simple prime number generator
*/
count = 1;
n = 1;
limit = 100;
while (n < limit) {
k=3;
p=1;
n=n+2;
while ((k*k<=n) && (p)) {
p=n/k*k!=n;
k=k+2;
}
if (p) {
print(n, " is prime\n");
count = count + 1;
}
}
print("Total primes found: ", count, "\n");""").printTokens()
} // primes
 
// Function to generate test case 14 source: Ascii Mandelbrot.
fun ascii() {
Lexer(
"""{
/*
This is an integer ascii Mandelbrot generator
*/
left_edge = -420;
right_edge = 300;
top_edge = 300;
bottom_edge = -300;
x_step = 7;
y_step = 15;
 
max_iter = 200;
 
y0 = top_edge;
while (y0 > bottom_edge) {
x0 = left_edge;
while (x0 < right_edge) {
y = 0;
x = 0;
the_char = ' ';
i = 0;
while (i < max_iter) {
x_x = (x * x) / 200;
y_y = (y * y) / 200;
if (x_x + y_y > 800 ) {
the_char = '0' + i;
if (i > 9) {
the_char = '@';
}
i = max_iter;
}
y = x * y / 100 + y0;
x = x_x - y_y + x0;
i = i + 1;
}
putc(the_char);
x0 = x0 + x_step;
}
putc('\n');
y0 = y0 - y_step;
}
}
""").printTokens()
} // ascii
 
when (number) {
0 ->
hello()
1 ->
phoenix()
2 ->
symbols()
3 ->
four()
4 ->
count()
5 ->
doors()
6 ->
negative()
7 ->
deep()
8 ->
gcd()
9 ->
factorial()
10 ->
fibonacci()
11 ->
fizzbuzz()
12 ->
bottles()
13 ->
primes()
14 ->
ascii()
15 -> // Lexer: Empty character constant
Lexer("''").printTokens()
16 -> // Lexer: Unknown escape sequence
Lexer("'\\x").printTokens()
17 -> // Lexer: Multi-character constant
Lexer("' ").printTokens()
18 -> // Lexer: EOF in comment
Lexer("/*").printTokens()
19 -> // Lexer: EOL in string
Lexer("\"\n").printTokens()
20 -> // Lexer: EOF in string
Lexer("\"").printTokens()
21 -> // Lexer: Escape sequence unknown
Lexer("\"\\x").printTokens()
22 -> // Lexer: Unrecognized character
Lexer("~").printTokens()
23 -> // Lexer: invalid number
Lexer("9a9").printTokens()
24 -> // Lexer: Number exceeds maximum value
Lexer("2147483648\n9223372036854775808").printTokens()
25 -> // Lexer: Operator expected
Lexer("|.").printTokens()
else ->
println("Invalid test number %d!".format(number))
} // when
} // tests
 
// Main function to check our source and read its data before parsing it.
// With no source specified, run the test of all symbols.
fun main(args: Array<String>) {
try {
val s =
if (args.size > 0 && args[0].isNotEmpty()) // file on command line
java.util.Scanner(java.io.File(args[0]))
else // use the console
java.util.Scanner(System.`in`)
var source = ""
while (s.hasNext())
source += s.nextLine()+
if (s.hasNext())
"\n"
else
""
if (args.size > 0 && args[0].isNotEmpty()) // file on command line
Lexer(source).printTokens()
else {
val digits = source.filter { it in '0'..'9' }
when {
source.isEmpty() -> // nothing given
tests(2)
source.length in 1..2 && digits.length == source.length &&
digits.toInt() in 0..25 ->
tests(digits.toInt())
else ->
Lexer(source).printTokens()
} // when
} // if
} catch(e: Throwable) {
println(e.message)
System.exit(1)
} // try
} // main</syntaxhighlight>
{{out|case=test case 3: All Symbols}}
<b>
<pre>
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
22 29 End_of_input
</pre>
</b>
 
=={{header|Lua}}==
Line 8,227 ⟶ 10,614:
 
The first module is simply a table defining the names of tokens which don't have an associated value.
<langsyntaxhighlight Lualang="lua">-- module token_name (in a file "token_name.lua")
local token_name = {
['*'] = 'Op_multiply',
Line 8,256 ⟶ 10,643:
['putc'] = 'Keyword_putc',
}
return token_name</langsyntaxhighlight>
 
This module exports a function <i>find_token</i>, which attempts to find the next valid token from a specified position in a source line.
<langsyntaxhighlight Lualang="lua">-- module lpeg_token_finder
local M = {} -- only items added to M will be public (via 'return M' at end)
local table, concat = table, table.concat
Line 8,342 ⟶ 10,729:
end
return M</langsyntaxhighlight>
 
The <i>lexer</i> module uses <i>finder.find_token</i> to produce an iterator over the tokens in a source.
<langsyntaxhighlight Lualang="lua">-- module lexer
local M = {} -- only items added to M will publicly available (via 'return M' at end)
local string, io, coroutine, yield = string, io, coroutine, coroutine.yield
Line 8,424 ⟶ 10,811:
-- M._INTERNALS = _ENV
return M
</syntaxhighlight>
</lang>
 
This script uses <i>lexer.tokenize_text</i> to show the token sequence produced from a source text.
 
<langsyntaxhighlight Lualang="lua">lexer = require 'lexer'
format, gsub = string.format, string.gsub
 
Line 8,466 ⟶ 10,853:
-- etc.
end
</syntaxhighlight>
</lang>
 
===Using only standard libraries===
This version replaces the <i>lpeg_token_finder</i> module of the LPeg version with this <i>basic_token_finder</i> module, altering the <i>require</i> expression near the top of the <i>lexer</i> module accordingly. Tested with Lua 5.3.5. (Note that <i>select</i> is a standard function as of Lua 5.2.)
 
<langsyntaxhighlight lang="lua">-- module basic_token_finder
local M = {} -- only items added to M will be public (via 'return M' at end)
local table, string = table, string
Line 8,601 ⟶ 10,988:
 
-- M._ENV = _ENV
return M</langsyntaxhighlight>
 
=={{header|M2000 Interpreter}}==
<syntaxhighlight lang="m2000 interpreter">
<lang M2000 Interpreter>
Module lexical_analyzer {
a$={/*
Line 8,860 ⟶ 11,247:
}
lexical_analyzer
</syntaxhighlight>
</lang>
 
{{out}}
Line 8,899 ⟶ 11,286:
 
</pre >
 
=={{header|Mercury}}==
{{trans|ATS}}
{{works with|Mercury|20.06.1}}
 
 
<syntaxhighlight lang="mercury">% -*- mercury -*-
%
% Compile with maybe something like:
% mmc -O4 --intermod-opt -E --make --warn-non-tail-recursion self-and-mutual lex
%
 
:- module lex.
 
:- interface.
:- import_module io.
:- pred main(io::di, io::uo) is det.
 
:- implementation.
:- import_module char.
:- import_module exception.
:- import_module int.
:- import_module list.
:- import_module stack.
:- import_module string.
 
:- type token_t
---> token_ELSE
; token_IF
; token_PRINT
; token_PUTC
; token_WHILE
; token_MULTIPLY
; token_DIVIDE
; token_MOD
; token_ADD
; token_SUBTRACT
; token_NEGATE
; token_LESS
; token_LESSEQUAL
; token_GREATER
; token_GREATEREQUAL
; token_EQUAL
; token_NOTEQUAL
; token_NOT
; token_ASSIGN
; token_AND
; token_OR
; token_LEFTPAREN
; token_RIGHTPAREN
; token_LEFTBRACE
; token_RIGHTBRACE
; token_SEMICOLON
; token_COMMA
; token_IDENTIFIER
; token_INTEGER
; token_STRING
; token_END_OF_INPUT.
 
:- type ch_t % The type of a fetched character.
---> {int, % A character or `eof', stored as an int.
int, % The line number.
int}. % The column number.
 
:- type inp_t % The `inputter' type. Fetches one character.
---> inp_t(inpf :: text_input_stream,
line_no :: int,
column_no :: int,
pushback :: stack(ch_t)).
 
:- type toktup_t % The type of a scanned token with its argument.
---> {token_t, % The token kind.
string, % An argument. (May or may not be meaningful.)
int, % The starting line number.
int}. % The starting column number.
 
main(!IO) :-
command_line_arguments(Args, !IO),
(
if (Args = [])
then (InpF_filename = "-",
OutF_filename = "-",
main_program(InpF_filename, OutF_filename, !IO))
else if (Args = [F1])
then (InpF_filename = F1,
OutF_filename = "-",
main_program(InpF_filename, OutF_filename, !IO))
else if (Args = [F1, F2])
then (InpF_filename = F1,
OutF_filename = F2,
main_program(InpF_filename, OutF_filename, !IO))
else usage_error(!IO)
).
 
:- pred main_program(string::in, string::in, io::di, io::uo) is det.
main_program(InpF_filename, OutF_filename, !IO) :-
open_InpF(InpF, InpF_filename, !IO),
open_OutF(OutF, OutF_filename, !IO),
init(InpF, Inp0),
scan_text(OutF, Inp0, _, !IO).
 
:- pred open_InpF(text_input_stream::out, string::in,
io::di, io::uo) is det.
open_InpF(InpF, InpF_filename, !IO) :-
if (InpF_filename = "-")
then (InpF = io.stdin_stream)
else
(
open_input(InpF_filename, InpF_result, !IO),
(
if (InpF_result = ok(F))
then (InpF = F)
else throw("Error: cannot open " ++ InpF_filename ++
" for input")
)
).
 
:- pred open_OutF(text_output_stream::out, string::in,
io::di, io::uo) is det.
open_OutF(OutF, OutF_filename, !IO) :-
if (OutF_filename = "-")
then (OutF = io.stdout_stream)
else
(
open_output(OutF_filename, OutF_result, !IO),
(
if (OutF_result = ok(F))
then (OutF = F)
else throw("Error: cannot open " ++ OutF_filename ++
" for output")
)
).
 
:- pred usage_error(io::di, io::uo) is det.
usage_error(!IO) :-
progname("lex", ProgName, !IO),
(io.format("Usage: %s [INPUT_FILE [OUTPUT_FILE]]\n",
[s(ProgName)], !IO)),
(io.write_string("If INPUT_FILE is \"-\" or not present then standard input is used.\n",
!IO)),
(io.write_string("If OUTPUT_FILE is \"-\" or not present then standard output is used.\n",
!IO)),
set_exit_status(1, !IO).
 
:- pred scan_text(text_output_stream::in, inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_text(OutF, !Inp, !IO) :-
get_next_token(TokTup, !Inp, !IO),
print_token(TokTup, OutF, !IO),
{Tok, _, _, _} = TokTup,
(
if (Tok = token_END_OF_INPUT)
then true
else scan_text(OutF, !Inp, !IO)
).
 
:- pred print_token(toktup_t::in, text_output_stream::in,
io::di, io::uo) is det.
print_token(TokTup, OutF, !IO) :-
{Tok, Arg, Line_no, Column_no} = TokTup,
token_name(Tok) = TokName,
(io.format(OutF, "%5d %5d %s",
[i(Line_no), i(Column_no), s(TokName)],
!IO)),
(
if (Tok = token_IDENTIFIER)
then (io.format(OutF, " %s", [s(Arg)], !IO))
else if (Tok = token_INTEGER)
then (io.format(OutF, " %s", [s(Arg)], !IO))
else if (Tok = token_STRING)
then (io.format(OutF, " %s", [s(Arg)], !IO))
else true
),
(io.format(OutF, "\n", [], !IO)).
 
:- func token_name(token_t) = string is det.
:- pred token_name(token_t::in, string::out) is det.
token_name(Tok) = Str :- token_name(Tok, Str).
token_name(token_ELSE, "Keyword_else").
token_name(token_IF, "Keyword_if").
token_name(token_PRINT, "Keyword_print").
token_name(token_PUTC, "Keyword_putc").
token_name(token_WHILE, "Keyword_while").
token_name(token_MULTIPLY, "Op_multiply").
token_name(token_DIVIDE, "Op_divide").
token_name(token_MOD, "Op_mod").
token_name(token_ADD, "Op_add").
token_name(token_SUBTRACT, "Op_subtract").
token_name(token_NEGATE, "Op_negate").
token_name(token_LESS, "Op_less").
token_name(token_LESSEQUAL, "Op_lessequal").
token_name(token_GREATER, "Op_greater").
token_name(token_GREATEREQUAL, "Op_greaterequal").
token_name(token_EQUAL, "Op_equal").
token_name(token_NOTEQUAL, "Op_notequal").
token_name(token_NOT, "Op_not").
token_name(token_ASSIGN, "Op_assign").
token_name(token_AND, "Op_and").
token_name(token_OR, "Op_or").
token_name(token_LEFTPAREN, "LeftParen").
token_name(token_RIGHTPAREN, "RightParen").
token_name(token_LEFTBRACE, "LeftBrace").
token_name(token_RIGHTBRACE, "RightBrace").
token_name(token_SEMICOLON, "Semicolon").
token_name(token_COMMA, "Comma").
token_name(token_IDENTIFIER, "Identifier").
token_name(token_INTEGER, "Integer").
token_name(token_STRING, "String").
token_name(token_END_OF_INPUT, "End_of_input").
 
:- pred get_next_token(toktup_t::out, inp_t::in, inp_t::out,
io::di, io::uo) is det.
get_next_token(TokTup, !Inp, !IO) :-
skip_spaces_and_comments(!Inp, !IO),
get_ch(Ch, !Inp, !IO),
{IChar, Line_no, Column_no} = Ch,
LN = Line_no,
CN = Column_no,
(
if (IChar = eof)
then
(
TokTup = {token_END_OF_INPUT, "", LN, CN}
)
else
(
Char = det_from_int(IChar),
(
if (Char = (','))
then (TokTup = {token_COMMA, ",", LN, CN})
else if (Char = (';'))
then (TokTup = {token_SEMICOLON, ";", LN, CN})
else if (Char = ('('))
then (TokTup = {token_LEFTPAREN, "(", LN, CN})
else if (Char = (')'))
then (TokTup = {token_RIGHTPAREN, ")", LN, CN})
else if (Char = ('{'))
then (TokTup = {token_LEFTBRACE, "{", LN, CN})
else if (Char = ('}'))
then (TokTup = {token_RIGHTBRACE, "}", LN, CN})
else if (Char = ('*'))
then (TokTup = {token_MULTIPLY, "*", LN, CN})
else if (Char = ('/'))
then (TokTup = {token_DIVIDE, "/", LN, CN})
else if (Char = ('%'))
then (TokTup = {token_MOD, "%", LN, CN})
else if (Char = ('+'))
then (TokTup = {token_ADD, "+", LN, CN})
else if (Char = ('-'))
then (TokTup = {token_SUBTRACT, "-", LN, CN})
else if (Char = ('<'))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = to_int('='))
then
(
TokTup = {token_LESSEQUAL, "<=", LN, CN}
)
else
(
push_back(Ch1, !Inp),
TokTup = {token_LESS, "<", LN, CN}
)
)
)
else if (Char = ('>'))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = to_int('='))
then
(
TokTup = {token_GREATEREQUAL, ">=", LN, CN}
)
else
(
push_back(Ch1, !Inp),
TokTup = {token_GREATER, ">", LN, CN}
)
)
)
else if (Char = ('='))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = to_int('='))
then
(
TokTup = {token_EQUAL, "==", LN, CN}
)
else
(
push_back(Ch1, !Inp),
TokTup = {token_ASSIGN, "=", LN, CN}
)
)
)
else if (Char = ('!'))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = to_int('='))
then
(
TokTup = {token_NOTEQUAL, "!=", LN, CN}
)
else
(
push_back(Ch1, !Inp),
TokTup = {token_NOT, "!", LN, CN}
)
)
)
else if (Char = ('&'))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = to_int('&'))
then
(
TokTup = {token_AND, "&&", LN, CN}
)
else throw("Error: unexpected character '" ++
from_char(Char) ++ "' at " ++
from_int(LN) ++ ":" ++
from_int(CN))
)
)
else if (Char = ('|'))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = to_int('|'))
then
(
TokTup = {token_OR, "||", LN, CN}
)
else throw("Error: unexpected character '" ++
from_char(Char) ++ "' at " ++
from_int(LN) ++ ":" ++
from_int(CN))
)
)
else if (Char = ('"'))
then
(
push_back(Ch, !Inp),
scan_string_literal(TokTup, !Inp, !IO)
)
else if (Char = ('\''))
then
(
push_back(Ch, !Inp),
scan_character_literal(TokTup, !Inp, !IO)
)
else if (is_alpha(Char))
then
(
push_back(Ch, !Inp),
scan_identifier_or_reserved_word(
TokTup, !Inp, !IO)
)
else if (is_digit(Char))
then
(
push_back(Ch, !Inp),
scan_integer_literal(TokTup, !Inp, !IO)
)
else
(
throw("Error: unexpected character '" ++
from_char(Char) ++ "' at " ++
from_int(LN) ++ ":" ++
from_int(CN))
)
)
)
).
 
 
:- pred skip_spaces_and_comments(inp_t::in, inp_t::out,
io::di, io::uo) is det.
skip_spaces_and_comments(!Inp, !IO) :-
get_ch(Ch, !Inp, !IO),
Ch = {IChar, _, _},
(
if (IChar = eof)
then push_back(Ch, !Inp)
else
if (is_whitespace(det_from_int(IChar)))
then skip_spaces_and_comments(!Inp, !IO)
else if (IChar = to_int('/'))
then
(
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, Line_no, Column_no},
(
if (IChar1 = to_int('*'))
then
(
scan_comment(Line_no, Column_no,
!Inp, !IO),
skip_spaces_and_comments(!Inp, !IO)
)
else
(
push_back(Ch1, !Inp),
push_back(Ch, !Inp)
)
)
)
else push_back(Ch, !Inp)
).
 
:- pred scan_comment(int::in, int::in, % line and column nos.
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_comment(Line_no, Column_no, !Inp, !IO) :-
get_ch(Ch, !Inp, !IO),
{IChar, _, _} = Ch,
(
if (IChar = eof)
then throw("Error: unterminated comment " ++
"starting at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else
(
det_from_int(IChar) = Char,
(
if (Char = ('*'))
then
(
get_ch(Ch1, !Inp, !IO),
{IChar1, _, _} = Ch1,
(
if (IChar1 = to_int('/'))
then true % End of comment has been reached.
else
(
push_back(Ch1, !Inp),
scan_comment(Line_no, Column_no, !Inp,
!IO)
)
)
)
else scan_comment(Line_no, Column_no, !Inp, !IO)
)
)
).
 
:- pred scan_character_literal(toktup_t::out,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_character_literal(TokTup, !Inp, !IO) :-
get_ch(Ch, !Inp, !IO),
Ch = {OpenQuote, Line_no, Column_no},
CloseQuote = OpenQuote,
scan_char_lit_contents(CodePoint, Line_no, Column_no,
!Inp, !IO),
check_char_lit_end(CloseQuote, Line_no, Column_no, !Inp, !IO),
Arg = from_int(CodePoint),
TokTup = {token_INTEGER, Arg, Line_no, Column_no}.
 
:- pred scan_char_lit_contents(int::out, int::in, int::in,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_char_lit_contents(CodePoint, Line_no, Column_no,
!Inp, !IO) :-
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, Line_no1, Column_no1},
(
if (IChar1 = eof)
then throw("Error: end of input in character literal " ++
"starting at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else if (IChar1 = to_int('\\'))
then
(
get_ch(Ch2, !Inp, !IO),
Ch2 = {IChar2, _, _},
(if (IChar2 = eof)
then throw("Error: end of input in character literal " ++
"starting at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else if (IChar2 = to_int('n'))
then (CodePoint = to_int('\n'))
else if (IChar2 = to_int('\\'))
then (CodePoint = to_int('\\'))
else throw("Error: unsupported escape \\" ++
from_char(det_from_int(IChar2)) ++
" at " ++ from_int(Line_no1) ++
":" ++ from_int(Column_no1))
)
)
else (CodePoint = IChar1)
).
 
:- pred check_char_lit_end(int::in, int::in, int::in,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
check_char_lit_end(CloseQuote, Line_no, Column_no, !Inp, !IO) :-
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, _, _},
(
if (IChar1 = CloseQuote)
then true
else find_bad_char_lit_end(CloseQuote, Line_no, Column_no,
!Inp, !IO)
).
 
:- pred find_bad_char_lit_end(int::in, int::in, int::in,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
find_bad_char_lit_end(CloseQuote, Line_no, Column_no, !Inp, !IO) :-
get_ch(Ch2, !Inp, !IO),
Ch2 = {IChar2, _, _},
(
if (IChar2 = CloseQuote)
then throw("Error: unsupported multicharacter literal " ++
" at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else if (IChar2 = eof)
then throw("Error: end of input in character literal " ++
" at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else find_bad_char_lit_end(CloseQuote, Line_no, Column_no,
!Inp, !IO)
).
 
:- pred scan_string_literal(toktup_t::out,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_string_literal(TokTup, !Inp, !IO) :-
get_ch(Ch, !Inp, !IO),
Ch = {OpenQuote, Line_no, Column_no},
CloseQuote = OpenQuote,
scan_string_lit_contents("", Str, CloseQuote,
Line_no, Column_no,
!Inp, !IO),
Arg = from_char(det_from_int(OpenQuote)) ++
Str ++ from_char(det_from_int(CloseQuote)),
TokTup = {token_STRING, Arg, Line_no, Column_no}.
 
:- pred scan_string_lit_contents(string::in, string::out, int::in,
int::in, int::in,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_string_lit_contents(Str0, Str, CloseQuote, Line_no, Column_no,
!Inp, !IO) :-
get_ch(Ch1, !Inp, !IO),
Ch1 = {IChar1, Line_no1, Column_no1},
(
if (IChar1 = CloseQuote)
then (Str = Str0)
else if (IChar1 = eof)
then throw("Error: end of input in string literal " ++
"starting at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else if (IChar1 = to_int('\n'))
then throw("Error: end of line in string literal " ++
"starting at " ++ from_int(Line_no) ++ ":" ++
from_int(Column_no))
else if (IChar1 = to_int('\\'))
then
(
get_ch(Ch2, !Inp, !IO),
Ch2 = {IChar2, _, _},
(
if (IChar2 = to_int('n'))
then
(
Str1 = Str0 ++ "\\n",
scan_string_lit_contents(Str1, Str, CloseQuote,
Line_no, Column_no,
!Inp, !IO)
)
else if (IChar2 = to_int('\\'))
then
(
Str1 = Str0 ++ "\\\\",
scan_string_lit_contents(Str1, Str, CloseQuote,
Line_no, Column_no,
!Inp, !IO)
)
else if (IChar2 = eof)
then throw("Error: end of input in string literal " ++
"starting at " ++ from_int(Line_no) ++
":" ++ from_int(Column_no))
else if (IChar2 = to_int('\n'))
then throw("Error: end of line in string literal " ++
"starting at " ++ from_int(Line_no) ++
":" ++ from_int(Column_no))
else throw("Error: unsupported escape \\" ++
from_char(det_from_int(IChar2)) ++
" at " ++ from_int(Line_no1) ++
":" ++ from_int(Column_no1))
)
)
else
(
Char1 = det_from_int(IChar1),
Str1 = Str0 ++ from_char(Char1),
scan_string_lit_contents(Str1, Str, CloseQuote,
Line_no, Column_no, !Inp, !IO)
)
).
 
:- pred scan_identifier_or_reserved_word(toktup_t::out,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_identifier_or_reserved_word(TokTup, !Inp, !IO) :-
scan_integer_or_word(Str, Line_no, Column_no, !Inp, !IO),
(
if (Str = "if")
then (TokTup = {token_IF, Str, Line_no, Column_no})
else if (Str = "else")
then (TokTup = {token_ELSE, Str, Line_no, Column_no})
else if (Str = "while")
then (TokTup = {token_WHILE, Str, Line_no, Column_no})
else if (Str = "print")
then (TokTup = {token_PRINT, Str, Line_no, Column_no})
else if (Str = "putc")
then (TokTup = {token_PUTC, Str, Line_no, Column_no})
else (TokTup = {token_IDENTIFIER, Str, Line_no, Column_no})
).
 
:- pred scan_integer_literal(toktup_t::out, inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_integer_literal(TokTup, !Inp, !IO) :-
scan_integer_or_word(Str, Line_no, Column_no, !Inp, !IO),
(
if (not is_all_digits(Str))
then throw("Error: not a valid integer literal: " ++ Str)
else (TokTup = {token_INTEGER, Str, Line_no, Column_no})
).
 
:- pred scan_integer_or_word(string::out, int::out, int::out,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_integer_or_word(Str, Line_no, Column_no, !Inp, !IO) :-
get_ch({IChar, Line_no, Column_no}, !Inp, !IO),
(
if (IChar = eof)
then throw("internal error")
else
(
Char = det_from_int(IChar),
(if (not is_alnum_or_underscore(Char))
then throw("internal error")
else scan_int_or_word(from_char(Char), Str, !Inp, !IO))
)
).
 
:- pred scan_int_or_word(string::in, string::out,
inp_t::in, inp_t::out,
io::di, io::uo) is det.
scan_int_or_word(Str0, Str, !Inp, !IO) :-
get_ch(CharTup, !Inp, !IO),
{IChar, _, _} = CharTup,
(
if (IChar = eof)
then
(
push_back(CharTup, !Inp),
Str = Str0
)
else
(
Char = det_from_int(IChar),
(
if (not is_alnum_or_underscore(Char))
then
(
push_back(CharTup, !Inp),
Str = Str0
)
else scan_int_or_word(Str0 ++ from_char(Char), Str,
!Inp, !IO)
)
)
).
 
:- pred init(text_input_stream::in, inp_t::out) is det.
init(Inpf, Inp) :-
Inp = inp_t(Inpf, 1, 1, init).
 
 
:- pred get_ch(ch_t::out, inp_t::in, inp_t::out,
io::di, io::uo) is det.
get_ch(Ch, Inp0, Inp, !IO) :-
if (pop(Ch1, Inp0^pushback, Pushback))
then
(
Ch = Ch1,
Inp = (Inp0^pushback := Pushback)
)
else
(
inp_t(Inpf, Line_no, Column_no, Pushback) = Inp0,
read_char_unboxed(Inpf, Result, Char, !IO),
(
if (Result = ok)
then
(
Ch = {to_int(Char), Line_no, Column_no},
Inp =
(if (Char = ('\n'))
then inp_t(Inpf, Line_no + 1, 1, Pushback)
else inp_t(Inpf, Line_no, Column_no + 1, Pushback))
)
else
(
Ch = {eof, Line_no, Column_no},
Inp = Inp0
)
)
).
 
:- pred push_back(ch_t::in, inp_t::in, inp_t::out) is det.
push_back(Ch, Inp0, Inp) :-
Inp = (Inp0^pushback := push(Inp0^pushback, Ch)).
 
:- func eof = int is det.
eof = -1.</syntaxhighlight>
 
{{out}}
<pre>$ mmc -O6 --intermod-opt -E --make --warn-non-tail-recursion self-and-mutual lex && ./lex compiler-tests/testcase3.t
Making Mercury/int3s/lex.int3
Making Mercury/ints/lex.int
Making Mercury/opts/lex.opt
Making Mercury/cs/lex.c
Making Mercury/os/lex.o
Making lex
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
 
=={{header|Nim}}==
Tested with Nim v0.19.4. Both examples are tested against all programs in [[Compiler/Sample programs]].
===Using string with regular expressions===
<langsyntaxhighlight lang="nim">
import re, strformat, strutils
 
Line 9,095 ⟶ 12,263:
 
echo input.tokenize.output
</syntaxhighlight>
</lang>
===Using stream with lexer library===
<langsyntaxhighlight lang="nim">
import lexbase, streams
from strutils import Whitespace
Line 9,408 ⟶ 12,576:
echo &"({l.lineNumber},{l.getColNumber l.bufpos + 1}) {l.error}"
main()
</syntaxhighlight>
</lang>
 
===Using nothing but system and strutils===
<langsyntaxhighlight lang="nim">import strutils
 
type
Line 9,631 ⟶ 12,799:
stdout.write('\n')
if token.kind == tokEnd:
break</langsyntaxhighlight>
 
=={{header|ObjectIcon}}==
{{trans|Icon}}
{{trans|ATS}}
 
 
There are very few changes from the ordinary Icon version: I/O is modified to use FileStreams; and the '''max''' procedure is removed, because there is an Object Icon builtin procedure.
 
 
<syntaxhighlight lang="objecticon"># -*- ObjectIcon -*-
#
# The Rosetta Code lexical analyzer in Object Icon. Based upon the ATS
# implementation.
#
# Usage: lex [INPUTFILE [OUTPUTFILE]]
# If INPUTFILE or OUTPUTFILE is "-" or missing, then standard input
# or standard output is used, respectively. *)
#
 
import io
 
$define EOF -1
 
$define TOKEN_ELSE 0
$define TOKEN_IF 1
$define TOKEN_PRINT 2
$define TOKEN_PUTC 3
$define TOKEN_WHILE 4
$define TOKEN_MULTIPLY 5
$define TOKEN_DIVIDE 6
$define TOKEN_MOD 7
$define TOKEN_ADD 8
$define TOKEN_SUBTRACT 9
$define TOKEN_NEGATE 10
$define TOKEN_LESS 11
$define TOKEN_LESSEQUAL 12
$define TOKEN_GREATER 13
$define TOKEN_GREATEREQUAL 14
$define TOKEN_EQUAL 15
$define TOKEN_NOTEQUAL 16
$define TOKEN_NOT 17
$define TOKEN_ASSIGN 18
$define TOKEN_AND 19
$define TOKEN_OR 20
$define TOKEN_LEFTPAREN 21
$define TOKEN_RIGHTPAREN 22
$define TOKEN_LEFTBRACE 23
$define TOKEN_RIGHTBRACE 24
$define TOKEN_SEMICOLON 25
$define TOKEN_COMMA 26
$define TOKEN_IDENTIFIER 27
$define TOKEN_INTEGER 28
$define TOKEN_STRING 29
$define TOKEN_END_OF_INPUT 30
 
global whitespace
global ident_start
global ident_continuation
 
procedure main(args)
local inpf, outf
local pushback_buffer, inp, pushback
 
initial {
whitespace := ' \t\v\f\r\n'
ident_start := '_' ++ &letters
ident_continuation := ident_start ++ &digits
}
 
inpf := FileStream.stdin
outf := FileStream.stdout
if 1 <= *args & args[1] ~== "-" then {
inpf := FileStream(args[1], FileOpt.RDONLY) | stop(&why)
}
if 2 <= *args & args[2] ~== "-" then {
outf := FileStream(args[2], ior(FileOpt.WRONLY,
FileOpt.TRUNC,
FileOpt.CREAT)) | stop(&why)
}
 
pushback_buffer := []
inp := create inputter(inpf, pushback_buffer)
pushback := create repeat push(pushback_buffer, \@&source)
@pushback # The first invocation does nothing.
 
scan_text(outf, inp, pushback)
end
 
procedure scan_text(outf, inp, pushback)
local ch
 
while /ch | ch[1] ~=== EOF do {
skip_spaces_and_comments(inp, pushback)
ch := @inp
if ch[1] === EOF then {
print_token(outf, [TOKEN_END_OF_INPUT, "", ch[2], ch[3]])
} else {
ch @pushback
print_token(outf, get_next_token(inp, pushback))
}
}
end
 
procedure get_next_token(inp, pushback)
local ch, ch1
local ln, cn
 
skip_spaces_and_comments(inp, pushback)
ch := @inp
ln := ch[2] # line number
cn := ch[3] # column number
case ch[1] of {
"," : return [TOKEN_COMMA, ",", ln, cn]
";" : return [TOKEN_SEMICOLON, ";", ln, cn]
"(" : return [TOKEN_LEFTPAREN, "(", ln, cn]
")" : return [TOKEN_RIGHTPAREN, ")", ln, cn]
"{" : return [TOKEN_LEFTBRACE, "{", ln, cn]
"}" : return [TOKEN_RIGHTBRACE, "}", ln, cn]
"*" : return [TOKEN_MULTIPLY, "*", ln, cn]
"/" : return [TOKEN_DIVIDE, "/", ln, cn]
"%" : return [TOKEN_MOD, "%", ln, cn]
"+" : return [TOKEN_ADD, "+", ln, cn]
"-" : return [TOKEN_SUBTRACT, "-", ln, cn]
"<" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_LESSEQUAL, "<=", ln, cn]
} else {
ch1 @pushback
return [TOKEN_LESS, "<", ln, cn]
}
}
">" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_GREATEREQUAL, ">=", ln, cn]
} else {
ch1 @pushback
return [TOKEN_GREATER, ">", ln, cn]
}
}
"=" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_EQUAL, "==", ln, cn]
} else {
ch1 @pushback
return [TOKEN_ASSIGN, "=", ln, cn]
}
}
"!" : {
ch1 := @inp
if ch1[1] === "=" then {
return [TOKEN_NOTEQUAL, "!=", ln, cn]
} else {
ch1 @pushback
return [TOKEN_NOT, "!", ln, cn]
}
}
"&" : {
ch1 := @inp
if ch1[1] === "&" then {
return [TOKEN_AND, "&&", ln, cn]
} else {
unexpected_character(ln, cn, ch)
}
}
"|" : {
ch1 := @inp
if ch1[1] === "|" then {
return [TOKEN_OR, "||", ln, cn]
} else {
unexpected_character(ln, cn, ch)
}
}
"\"" : {
ch @pushback
return scan_string_literal(inp)
}
"'" : {
ch @pushback
return scan_character_literal(inp, pushback)
}
default : {
if any(&digits, ch[1]) then {
ch @pushback
return scan_integer_literal(inp, pushback)
} else if any(ident_start, ch[1]) then {
ch @pushback
return scan_identifier_or_reserved_word (inp, pushback)
} else {
unexpected_character(ln, cn, ch)
}
}
}
end
 
procedure scan_identifier_or_reserved_word(inp, pushback)
local ch
local s
local line_no, column_no
 
s := ""
ch := @inp
line_no := ch[2]
column_no := ch[3]
while EOF ~=== ch[1] & any(ident_continuation, ch[1]) do {
s ||:= ch[1]
ch := @inp
}
ch @pushback
return reserved_word_lookup (s, line_no, column_no)
end
 
procedure scan_integer_literal(inp, pushback)
local ch
local s
local line_no, column_no
 
s := ""
ch := @inp
line_no := ch[2]
column_no := ch[3]
while EOF ~=== ch[1] & any(ident_continuation, ch[1]) do {
s ||:= ch[1]
ch := @inp
}
ch @pushback
not upto(~&digits, s) | invalid_integer_literal(line_no, column_no, s)
return [TOKEN_INTEGER, s, line_no, column_no]
end
 
procedure scan_character_literal(inp, pushback)
local ch, ch1
local close_quote
local toktup
local line_no, column_no
 
ch := @inp # The opening quote.
close_quote := ch[1] # Same as the opening quote.
ch @pushback
 
line_no := ch[2]
column_no := ch[3]
 
toktup := scan_character_literal_without_checking_end(inp)
ch1 := @inp
if ch1[1] ~=== close_quote then {
repeat {
case ch1[1] of {
EOF : unterminated_character_literal(line_no, column_no)
close_quote : multicharacter_literal(line_no, column_no)
default : ch1 := @inp
}
}
}
return toktup
end
 
procedure scan_character_literal_without_checking_end(inp)
local ch, ch1, ch2
 
ch := @inp # The opening quote.
ch1 := @inp
EOF ~=== ch1[1] | unterminated_character_literal(ch[2], ch[3])
if ch1[1] == "\\" then {
ch2 := @inp
EOF ~=== ch2[1] | unterminated_character_literal(ch[2], ch[3])
case ch2[1] of {
"n" : return [TOKEN_INTEGER, string(ord("\n")), ch[2], ch[3]]
"\\" : return [TOKEN_INTEGER, string(ord("\\")), ch[2], ch[3]]
default : unsupported_escape(ch1[2], ch1[3], ch2)
}
} else {
return [TOKEN_INTEGER, string(ord(ch1[1])), ch[2], ch[3]]
}
end
 
procedure scan_string_literal(inp)
local ch, ch1, ch2
local line_no, column_no
local close_quote
local s
local retval
 
ch := @inp # The opening quote
close_quote := ch[1] # Same as the opening quote.
line_no := ch[2]
column_no := ch[3]
 
s := ch[1]
until \retval do {
ch1 := @inp
ch1[1] ~=== EOF |
unterminated_string_literal (line_no, column_no,
"end of input")
ch1[1] ~== "\n" |
unterminated_string_literal (line_no, column_no,
"end of line")
if ch1[1] == close_quote then {
retval := [TOKEN_STRING, s || close_quote, line_no, column_no]
} else if ch1[1] ~== "\\" then {
s ||:= ch1[1]
} else {
ch2 := @inp
EOF ~=== ch2[1] | unsupported_escape(line_no, column_no, ch2)
case ch2[1] of {
"n" : s ||:= "\\n"
"\\" : s ||:= "\\\\"
default : unsupported_escape(line_no, column_no, ch2)
}
}
}
return retval
end
 
procedure skip_spaces_and_comments(inp, pushback)
local ch, ch1
 
repeat {
ch := @inp
(EOF === ch[1]) & { ch @pushback; return }
if not any(whitespace, ch[1]) then {
(ch[1] == "/") | { ch @pushback; return }
(ch1 := @inp) | { ch @pushback; return }
(ch1[1] == "*") | { ch1 @pushback; ch @pushback; return }
scan_comment(inp, ch[2], ch[3])
}
}
end
 
procedure scan_comment(inp, line_no, column_no)
local ch, ch1
 
until (\ch)[1] == "*" & (\ch1)[1] == "/" do {
ch := @inp
(EOF === ch[1]) & unterminated_comment(line_no, column_no)
if ch[1] == "*" then {
ch1 := @inp
(EOF === ch[1]) & unterminated_comment(line_no, column_no)
}
}
return
end
 
procedure reserved_word_lookup(s, line_no, column_no)
 
# Lookup is by an extremely simple perfect hash.
 
static reserved_words
static reserved_word_tokens
local hashval, token, toktup
 
initial {
reserved_words := ["if", "print", "else",
"", "putc", "",
"", "while", ""]
reserved_word_tokens :=
[TOKEN_IF, TOKEN_PRINT, TOKEN_ELSE,
TOKEN_IDENTIFIER, TOKEN_PUTC, TOKEN_IDENTIFIER,
TOKEN_IDENTIFIER, TOKEN_WHILE, TOKEN_IDENTIFIER]
}
 
if *s < 2 then {
toktup := [TOKEN_IDENTIFIER, s, line_no, column_no]
} else {
hashval := ((ord(s[1]) + ord(s[2])) % (*reserved_words)) + 1
token := reserved_word_tokens[hashval]
if token = TOKEN_IDENTIFIER | s ~== reserved_words[hashval] then {
toktup := [TOKEN_IDENTIFIER, s, line_no, column_no]
} else {
toktup := [token, s, line_no, column_no]
}
}
return toktup
end
 
procedure print_token(outf, toktup)
static token_names
local s_line, s_column
 
initial {
token_names := ["Keyword_else",
"Keyword_if",
"Keyword_print",
"Keyword_putc",
"Keyword_while",
"Op_multiply",
"Op_divide",
"Op_mod",
"Op_add",
"Op_subtract",
"Op_negate",
"Op_less",
"Op_lessequal",
"Op_greater",
"Op_greaterequal",
"Op_equal",
"Op_notequal",
"Op_not",
"Op_assign",
"Op_and",
"Op_or",
"LeftParen",
"RightParen",
"LeftBrace",
"RightBrace",
"Semicolon",
"Comma",
"Identifier",
"Integer",
"String",
"End_of_input"]
}
 
/outf := FileStream.stdout
s_line := string(toktup[3])
s_column := string(toktup[4])
writes(outf, right (s_line, max(5, *s_line)))
writes(outf, " ")
writes(outf, right (s_column, max(5, *s_column)))
writes(outf, " ")
writes(outf, token_names[toktup[1] + 1])
case toktup[1] of {
TOKEN_IDENTIFIER : writes(outf, " ", toktup[2])
TOKEN_INTEGER : writes(outf, " ", toktup[2])
TOKEN_STRING : writes(outf, " ", toktup[2])
}
write(outf)
return
end
 
procedure inputter(inpf, pushback_buffer)
local buffer
local line_no, column_no
local c
 
buffer := ""
line_no := 1
column_no := 1
 
repeat {
buffer? {
until *pushback_buffer = 0 & pos(0) do {
if *pushback_buffer ~= 0 then {
suspend pop(pushback_buffer)
} else {
c := move(1)
suspend [c, line_no, column_no]
if c == "\n" then {
line_no +:= 1
column_no := 1
} else {
column_no +:= 1
}
}
}
}
(buffer := reads(inpf, 2048)) |
suspend [EOF, line_no, column_no]
}
end
 
procedure unterminated_comment(line_no, column_no)
error("unterminated comment starting at ",
line_no, ":", column_no)
end
 
procedure unexpected_character(line_no, column_no, ch)
error("unexpected character '", ch[1], "' starting at ",
line_no, ":", column_no)
end
 
procedure unterminated_string_literal (line_no, column_no, cause)
error("unterminated string literal (", cause, ") starting at ",
line_no, ":", column_no)
end
 
procedure unsupported_escape (line_no, column_no, ch)
if ch[1] === EOF then {
error("unexpected \\ at end of input",
" starting at ", line_no, ":", column_no)
} else {
error("unsupported escape \\", ch[1],
" starting at ", line_no, ":", column_no)
}
end
 
procedure invalid_integer_literal(line_no, column_no, s)
error("invalid integer literal ", s,
" starting at ", line_no, ":", column_no)
end
 
procedure unterminated_character_literal(line_no, column_no)
error("unterminated character literal starting at ",
line_no, ":", column_no)
end
 
procedure multicharacter_literal(line_no, column_no)
error("unsupported multicharacter literal starting at ",
line_no, ":", column_no)
end
 
procedure error(args[])
write!([FileStream.stderr] ||| args)
exit(1)
end</syntaxhighlight>
 
 
{{out}}
<pre>$ oit -s -o lex lex-in-ObjectIcon.icn && ./lex compiler-tests/testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|OCaml}}==
{{works with|OCaml|4.12.1}}
{{trans|ATS}}
 
This is a close translation of the ATS. It may interest the reader to compare the two implementations.
 
(Much of the extra complication in the ATS comes from arrays being a linear type (whose "views" need tending), and from values of linear type having to be local to any function using them. This limitation could have been worked around, and arrays more similar to OCaml arrays could have been used, but at a cost in safety and efficiency.)
 
<syntaxhighlight lang="ocaml">(*------------------------------------------------------------------*)
(* The Rosetta Code lexical analyzer, in OCaml. Based on the ATS. *)
 
(* When you compare this code to the ATS code, please keep in mind
that, although ATS has an ML-like syntax:
 
* The type system is not the same at all.
 
* Most ATS functions are not closures. Those that are will have
special notations such as "<cloref1>" associated with them. *)
 
(*------------------------------------------------------------------*)
(* The following functions are compatible with ASCII. *)
 
let is_digit ichar =
48 <= ichar && ichar <= 57
 
let is_lower ichar =
97 <= ichar && ichar <= 122
 
let is_upper ichar =
65 <= ichar && ichar <= 90
 
let is_alpha ichar =
is_lower ichar || is_upper ichar
 
let is_alnum ichar =
is_digit ichar || is_alpha ichar
 
let is_ident_start ichar =
is_alpha ichar || ichar = 95
 
let is_ident_continuation ichar =
is_alnum ichar || ichar = 95
 
let is_space ichar =
ichar = 32 || (9 <= ichar && ichar <= 13)
 
(*------------------------------------------------------------------*)
 
let reverse_list_to_string lst =
List.rev lst |> List.to_seq |> String.of_seq
 
(*------------------------------------------------------------------*)
(* Character input more like that of C. There are various advantages
and disadvantages to this method, but key points in its favor are:
(a) it is how character input is done in the original ATS code, (b)
Unicode code points are 21-bit positive integers. *)
 
let eof = (-1)
 
let input_ichar channel =
try
int_of_char (input_char channel)
with
| End_of_file -> eof
 
(*------------------------------------------------------------------*)
 
(* The type of an input character. *)
 
module Ch =
struct
type t =
{
ichar : int;
line_no : int;
column_no : int
}
end
 
(*------------------------------------------------------------------*)
(* Inputting with unlimited pushback, and with counting of lines and
columns. *)
 
module Inp =
struct
type t =
{
inpf : in_channel;
pushback : Ch.t list;
line_no : int;
column_no : int
}
 
let of_in_channel inpf =
{ inpf = inpf;
pushback = [];
line_no = 1;
column_no = 1
}
 
let get_ch inp =
match inp.pushback with
| ch :: tail ->
(ch, {inp with pushback = tail})
| [] ->
let ichar = input_ichar inp.inpf in
if ichar = int_of_char '\n' then
({ ichar = ichar;
line_no = inp.line_no;
column_no = inp.column_no },
{ inp with line_no = inp.line_no + 1;
column_no = 1 })
else
({ ichar = ichar;
line_no = inp.line_no;
column_no = inp.column_no },
{ inp with column_no = inp.column_no + 1 })
 
let push_back_ch ch inp =
{inp with pushback = ch :: inp.pushback}
end
 
(*------------------------------------------------------------------*)
(* Tokens, appearing in tuples with arguments, and with line and
column numbers. The tokens are integers, so they can be used as
array indices. *)
 
(* (token, argument, line_no, column_no) *)
type toktup_t = int * string * int * int
 
let token_ELSE = 0
let token_IF = 1
let token_PRINT = 2
let token_PUTC = 3
let token_WHILE = 4
let token_MULTIPLY = 5
let token_DIVIDE = 6
let token_MOD = 7
let token_ADD = 8
let token_SUBTRACT = 9
let token_NEGATE = 10
let token_LESS = 11
let token_LESSEQUAL = 12
let token_GREATER = 13
let token_GREATEREQUAL = 14
let token_EQUAL = 15
let token_NOTEQUAL = 16
let token_NOT = 17
let token_ASSIGN = 18
let token_AND = 19
let token_OR = 20
let token_LEFTPAREN = 21
let token_RIGHTPAREN = 22
let token_LEFTBRACE = 23
let token_RIGHTBRACE = 24
let token_SEMICOLON = 25
let token_COMMA = 26
let token_IDENTIFIER = 27
let token_INTEGER = 28
let token_STRING = 29
let token_END_OF_INPUT = 30
;;
 
(* A *very* simple perfect hash for the reserved words. (Yes, this is
overkill, except for demonstration of the principle.) *)
 
let reserved_words =
[| "if"; "print"; "else"; ""; "putc"; ""; ""; "while"; "" |]
let reserved_word_tokens =
[| token_IF; token_PRINT; token_ELSE; token_IDENTIFIER;
token_PUTC; token_IDENTIFIER; token_IDENTIFIER; token_WHILE;
token_IDENTIFIER |]
 
let reserved_word_lookup s line_no column_no =
if String.length s < 2 then
(token_IDENTIFIER, s, line_no, column_no)
else
let hashval = (int_of_char s.[0] + int_of_char s.[1]) mod 9 in
let token = reserved_word_tokens.(hashval) in
if token = token_IDENTIFIER || s <> reserved_words.(hashval) then
(token_IDENTIFIER, s, line_no, column_no)
else
(token, s, line_no, column_no)
 
(* Token to string lookup. *)
 
let token_names =
[| "Keyword_else";
"Keyword_if";
"Keyword_print";
"Keyword_putc";
"Keyword_while";
"Op_multiply";
"Op_divide";
"Op_mod";
"Op_add";
"Op_subtract";
"Op_negate";
"Op_less";
"Op_lessequal";
"Op_greater";
"Op_greaterequal";
"Op_equal";
"Op_notequal";
"Op_not";
"Op_assign";
"Op_and";
"Op_or";
"LeftParen";
"RightParen";
"LeftBrace";
"RightBrace";
"Semicolon";
"Comma";
"Identifier";
"Integer";
"String";
"End_of_input" |]
 
let token_name token =
token_names.(token)
 
(*------------------------------------------------------------------*)
 
exception Unterminated_comment of int * int
exception Unterminated_character_literal of int * int
exception Multicharacter_literal of int * int
exception End_of_input_in_string_literal of int * int
exception End_of_line_in_string_literal of int * int
exception Unsupported_escape of int * int * int
exception Invalid_integer_literal of int * int * string
exception Unexpected_character of int * int * char
 
(*------------------------------------------------------------------*)
(* Skipping past spaces and comments. (A comment in the target
language is, if you think about it, a kind of space.) *)
 
let scan_comment inp line_no column_no =
let rec loop inp =
let (ch, inp) = Inp.get_ch inp in
if ch.ichar = eof then
raise (Unterminated_comment (line_no, column_no))
else if ch.ichar = int_of_char '*' then
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = eof then
raise (Unterminated_comment (line_no, column_no))
else if ch1.ichar = int_of_char '/' then
inp
else
loop inp
else
loop inp
in
loop inp
 
let skip_spaces_and_comments inp =
let rec loop inp =
let (ch, inp) = Inp.get_ch inp in
if is_space ch.ichar then
loop inp
else if ch.ichar = int_of_char '/' then
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '*' then
scan_comment inp ch.line_no ch.column_no |> loop
else
let inp = Inp.push_back_ch ch1 inp in
let inp = Inp.push_back_ch ch inp in
inp
else
Inp.push_back_ch ch inp
in
loop inp
 
(*------------------------------------------------------------------*)
(* Integer literals, identifiers, and reserved words. *)
 
(* In ATS the predicate for simple scan was supplied by template
expansion, which (typically) produces faster code than passing a
function or closure (although passing either of those could have
been done). Here I pass the predicate as a function/closure. It is
worth contrasting the methods. *)
let rec simple_scan pred lst inp =
let (ch, inp) = Inp.get_ch inp in
if pred ch.ichar then
simple_scan pred (char_of_int ch.ichar :: lst) inp
else
(lst, Inp.push_back_ch ch inp)
 
(* Demonstration of one way to make a new closure in OCaml. (In ATS,
one might see things that look similar but are actually template
operations.) *)
let simple_scan_iic = simple_scan is_ident_continuation
 
let scan_integer_literal inp =
let (ch, inp) = Inp.get_ch inp in
let _ = assert (is_digit ch.ichar) in
let (lst, inp) = simple_scan_iic [char_of_int ch.ichar] inp in
let s = reverse_list_to_string lst in
if List.for_all (fun c -> is_digit (int_of_char c)) lst then
((token_INTEGER, s, ch.line_no, ch.column_no), inp)
else
raise (Invalid_integer_literal (ch.line_no, ch.column_no, s))
 
let scan_identifier_or_reserved_word inp =
let (ch, inp) = Inp.get_ch inp in
let _ = assert (is_ident_start ch.ichar) in
let (lst, inp) = simple_scan_iic [char_of_int ch.ichar] inp in
let s = reverse_list_to_string lst in
let toktup = reserved_word_lookup s ch.line_no ch.column_no in
(toktup, inp)
 
(*------------------------------------------------------------------*)
(* String literals. *)
 
let scan_string_literal inp =
let (ch, inp) = Inp.get_ch inp in
let _ = assert (ch.ichar = int_of_char '"') in
 
let rec scan lst inp =
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = eof then
raise (End_of_input_in_string_literal
(ch.line_no, ch.column_no))
else if ch1.ichar = int_of_char '\n' then
raise (End_of_line_in_string_literal
(ch.line_no, ch.column_no))
else if ch1.ichar = int_of_char '"' then
(lst, inp)
else if ch1.ichar <> int_of_char '\\' then
scan (char_of_int ch1.ichar :: lst) inp
else
let (ch2, inp) = Inp.get_ch inp in
if ch2.ichar = int_of_char 'n' then
scan ('n' :: '\\' :: lst) inp
else if ch2.ichar = int_of_char '\\' then
scan ('\\' :: '\\' :: lst) inp
else
raise (Unsupported_escape (ch1.line_no, ch1.column_no,
ch2.ichar))
in
let lst = '"' :: [] in
let (lst, inp) = scan lst inp in
let lst = '"' :: lst in
let s = reverse_list_to_string lst in
((token_STRING, s, ch.line_no, ch.column_no), inp)
 
(*------------------------------------------------------------------*)
(* Character literals. *)
 
let scan_character_literal_without_checking_end inp =
let (ch, inp) = Inp.get_ch inp in
let _ = assert (ch.ichar = int_of_char '\'') in
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = eof then
raise (Unterminated_character_literal
(ch.line_no, ch.column_no))
else if ch1.ichar = int_of_char '\\' then
let (ch2, inp) = Inp.get_ch inp in
if ch2.ichar = eof then
raise (Unterminated_character_literal
(ch.line_no, ch.column_no))
else if ch2.ichar = int_of_char 'n' then
let s = (int_of_char '\n' |> string_of_int) in
((token_INTEGER, s, ch.line_no, ch.column_no), inp)
else if ch2.ichar = int_of_char '\\' then
let s = (int_of_char '\\' |> string_of_int) in
((token_INTEGER, s, ch.line_no, ch.column_no), inp)
else
raise (Unsupported_escape
(ch1.line_no, ch1.column_no, ch2.ichar))
else
let s = string_of_int ch1.ichar in
((token_INTEGER, s, ch.line_no, ch.column_no), inp)
 
let scan_character_literal inp =
let (toktup, inp) =
scan_character_literal_without_checking_end inp in
let (_, _, line_no, column_no) = toktup in
 
let check_end inp =
let (ch, inp) = Inp.get_ch inp in
if ch.ichar = int_of_char '\'' then
inp
else
let rec loop_to_end (ch1 : Ch.t) inp =
if ch1.ichar = eof then
raise (Unterminated_character_literal (line_no, column_no))
else if ch1.ichar = int_of_char '\'' then
raise (Multicharacter_literal (line_no, column_no))
else
let (ch1, inp) = Inp.get_ch inp in
loop_to_end ch1 inp
in
loop_to_end ch inp
in
let inp = check_end inp in
(toktup, inp)
 
(*------------------------------------------------------------------*)
 
let get_next_token inp =
let inp = skip_spaces_and_comments inp in
let (ch, inp) = Inp.get_ch inp in
let ln = ch.line_no in
let cn = ch.column_no in
if ch.ichar = eof then
((token_END_OF_INPUT, "", ln, cn), inp)
else
match char_of_int ch.ichar with
| ',' -> ((token_COMMA, ",", ln, cn), inp)
| ';' -> ((token_SEMICOLON, ";", ln, cn), inp)
| '(' -> ((token_LEFTPAREN, "(", ln, cn), inp)
| ')' -> ((token_RIGHTPAREN, ")", ln, cn), inp)
| '{' -> ((token_LEFTBRACE, "{", ln, cn), inp)
| '}' -> ((token_RIGHTBRACE, "}", ln, cn), inp)
| '*' -> ((token_MULTIPLY, "*", ln, cn), inp)
| '/' -> ((token_DIVIDE, "/", ln, cn), inp)
| '%' -> ((token_MOD, "%", ln, cn), inp)
| '+' -> ((token_ADD, "+", ln, cn), inp)
| '-' -> ((token_SUBTRACT, "-", ln, cn), inp)
| '<' ->
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '=' then
((token_LESSEQUAL, "<=", ln, cn), inp)
else
let inp = Inp.push_back_ch ch1 inp in
((token_LESS, "<", ln, cn), inp)
| '>' ->
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '=' then
((token_GREATEREQUAL, ">=", ln, cn), inp)
else
let inp = Inp.push_back_ch ch1 inp in
((token_GREATER, ">", ln, cn), inp)
| '=' ->
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '=' then
((token_EQUAL, "==", ln, cn), inp)
else
let inp = Inp.push_back_ch ch1 inp in
((token_ASSIGN, "=", ln, cn), inp)
| '!' ->
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '=' then
((token_NOTEQUAL, "!=", ln, cn), inp)
else
let inp = Inp.push_back_ch ch1 inp in
((token_NOT, "!", ln, cn), inp)
| '&' ->
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '&' then
((token_AND, "&&", ln, cn), inp)
else
raise (Unexpected_character (ch.line_no, ch.column_no,
char_of_int ch.ichar))
| '|' ->
let (ch1, inp) = Inp.get_ch inp in
if ch1.ichar = int_of_char '|' then
((token_OR, "||", ln, cn), inp)
else
raise (Unexpected_character (ch.line_no, ch.column_no,
char_of_int ch.ichar))
| '"' ->
let inp = Inp.push_back_ch ch inp in
scan_string_literal inp
| '\'' ->
let inp = Inp.push_back_ch ch inp in
scan_character_literal inp
| _ when is_digit ch.ichar ->
let inp = Inp.push_back_ch ch inp in
scan_integer_literal inp
| _ when is_ident_start ch.ichar ->
let inp = Inp.push_back_ch ch inp in
scan_identifier_or_reserved_word inp
| _ -> raise (Unexpected_character (ch.line_no, ch.column_no,
char_of_int ch.ichar))
 
let print_token outf toktup =
let (token, arg, line_no, column_no) = toktup in
let name = token_name token in
let (padding, str) =
match 0 with
| _ when token = token_IDENTIFIER -> (" ", arg)
| _ when token = token_INTEGER -> (" ", arg)
| _ when token = token_STRING -> (" ", arg)
| _ -> ("", "")
in
Printf.fprintf outf "%5d %5d %s%s%s\n"
line_no column_no name padding str
 
let scan_text outf inp =
let rec loop inp =
let (toktup, inp) = get_next_token inp in
begin
print_token outf toktup;
let (token, _, _, _) = toktup in
if token <> token_END_OF_INPUT then
loop inp
end
in
loop inp
 
(*------------------------------------------------------------------*)
 
let main () =
let inpf_filename =
if 2 <= Array.length Sys.argv then
Sys.argv.(1)
else
"-"
in
let outf_filename =
if 3 <= Array.length Sys.argv then
Sys.argv.(2)
else
"-"
in
let inpf =
if inpf_filename = "-" then
stdin
else
open_in inpf_filename
in
let outf =
if outf_filename = "-" then
stdout
else
open_out outf_filename
in
let inp = Inp.of_in_channel inpf in
scan_text outf inp
;;
 
main ()
 
(*------------------------------------------------------------------*)</syntaxhighlight>
 
{{out}}
<pre>$ ocamlopt -O2 lex.ml && ./a.out compiler-tests/testcase3.t
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|Ol}}==
Line 9,637 ⟶ 13,924:
Note: we do not print the line and token source code position for the simplicity.
 
<langsyntaxhighlight lang="scheme">
(import (owl parse))
 
Line 9,761 ⟶ 14,048:
(if (null? (cdr stream))
(print 'End_of_input))))
</syntaxhighlight>
</lang>
 
==== Testing ====
 
Testing function:
<langsyntaxhighlight lang="scheme">
(define (translate source)
(let ((stream (try-parse token-parser (str-iter source) #t)))
Line 9,772 ⟶ 14,059:
(if (null? (force (cdr stream)))
(print 'End_of_input))))
</syntaxhighlight>
</lang>
 
====== Testcase 1 ======
 
<langsyntaxhighlight lang="scheme">
(translate "
/*
Line 9,782 ⟶ 14,069:
*/
print(\"Hello, World!\\\\n\");
")</langsyntaxhighlight>
{{Out}}
<pre>
Line 9,795 ⟶ 14,082:
====== Testcase 2 ======
 
<langsyntaxhighlight lang="scheme">
(translate "
/*
Line 9,802 ⟶ 14,089:
phoenix_number = 142857;
print(phoenix_number, \"\\\\n\");
")</langsyntaxhighlight>
{{Out}}
<pre>
Line 9,821 ⟶ 14,108:
====== Testcase 3 ======
 
<langsyntaxhighlight lang="scheme">
(translate "
/*
Line 9,845 ⟶ 14,132:
/* character literal */ '\\\\'
/* character literal */ ' '
")</langsyntaxhighlight>
{{Out}}
<pre>
Line 9,886 ⟶ 14,173:
====== Testcase 4 ======
 
<langsyntaxhighlight lang="scheme">
(translate "
/*** test printing, embedded \\\\n and comments with lots of '*' ***/
Line 9,893 ⟶ 14,180:
print(\"Print a slash n - \\\\\\\\n.\\\\n\");
")
</syntaxhighlight>
</lang>
{{Out}}
<pre>
Line 9,916 ⟶ 14,203:
=={{header|Perl}}==
 
<langsyntaxhighlight lang="perl">#!/usr/bin/env perl
 
use strict;
Line 10,055 ⟶ 14,342:
($line, $col)
}
}</langsyntaxhighlight>
 
{{out|case=test case 3}}
Line 10,098 ⟶ 14,385:
===Alternate Perl Solution===
Tested on perl v5.26.1
<langsyntaxhighlight Perllang="perl">#!/usr/bin/perl
 
use strict; # lex.pl - source to tokens
Line 10,134 ⟶ 14,421:
1 + $` =~ tr/\n//, 1 + length $` =~ s/.*\n//sr, $^R;
}
printf "%5d %7d %s\n", 1 + tr/\n//, 1, 'End_of_input';</langsyntaxhighlight>
 
=={{header|Phix}}==
Line 10,141 ⟶ 14,428:
form. If required, demo\rosetta\Compiler\extra.e (below) contains some code that achieves the latter.
Code to print the human readable forms is likewise kept separate from any re-usable parts.
<!--<langsyntaxhighlight Phixlang="phix">(phixonline)-->
<span style="color: #000080;font-style:italic;">--
-- demo\rosetta\Compiler\core.e
Line 10,301 ⟶ 14,588:
<span style="color: #008080;">return</span> <span style="color: #000000;">s</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<!--</langsyntaxhighlight>-->
For running under pwa/p2js, we also have a "fake file/io" component:
<!--<langsyntaxhighlight Phixlang="phix">(phixonline)-->
<span style="color: #000080;font-style:italic;">--
-- demo\rosetta\Compiler\js_io.e
Line 10,312 ⟶ 14,599:
--</span>
<span style="color: #008080;">with</span> <span style="color: #008080;">javascript_semantics</span>
<span style="color: #008080004080;">constantsequence</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">known_files</span><span style="color: #0000FF;">,</span><span style="color: #000000;">kfc</span><span style="color: #0000FF;">}</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">columnize</span><span style="color: #0000FF;">({</span>
<span style="color: #0000FF;">{</span><span style="color: #008000;">"test3.c"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"""
/*
Line 10,377 ⟶ 14,664:
}
print(a);
"""</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)},</span>
<span style="color: #0000FF;">{</span><span style="color: #008000;">"Header.h"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"""
#define area(h, w) h * w
"""</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)},</span>
<span style="color: #0000FF;">{</span><span style="color: #008000;">"Source.t"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"""
#include "Header.h"
#define width 5
#define height 6
area = #area(height, width)#;
"""</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)}})</span>
<span style="color: #004080;">integersequence</span> <span style="color: #000000;">fnlinenos</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">repeat</span><span style="color: #0000FF;">(-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">linenoknown_files</span><span style="color: #0000FF;">))</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">function</span> <span style="color: #000000;">js_open</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">filename</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">fn</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">filename</span><span style="color: #0000FF;">,</span><span style="color: #000000;">known_files</span><span style="color: #0000FF;">)</span>
<span style="color: #7060A8;">assert</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">!=</span><span style="color: #000000;">0</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">linenolinenos</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">fn</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">function</span> <span style="color: #000000;">js_gets</span><span style="color: #0000FF;">(</span><span style="color: #004080;">integer</span> <span style="color: #000000;">fn</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">lineno</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">linenos</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">]+</span><span style="color: #000000;">1</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">lineno</span><span style="color: #0000FF;">><=</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">kfc</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">])</span> <span style="color: #008080;">then</span> <span style="color: #008080;">return</span> <span style="color: #000000;">EOF</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">kfclinenos</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">][</span> <span style="color: #0000000000FF;">lineno=</span> <span style="color: #0000FF000000;">]lineno</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">kfc</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">][</span><span style="color: #000000;">lineno</span><span style="color: #0000FF;">]</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">EOF</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<!--</langsyntaxhighlight>-->
The main lexer is also written to be reusable by later stages.
<!--<langsyntaxhighlight Phixlang="phix">(phixonline)-->
<span style="color: #000080;font-style:italic;">--
-- demo\\rosetta\\Compiler\\lex.e
Line 10,423 ⟶ 14,722:
<span style="color: #000000;">line</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span>
<span style="color: #000000;">col</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span>
<span style="color: #000000;">oneline</span> <span style="color: #0000FF;">=</span> <span style="color: #008080;">iff</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">platform</span><span style="color: #0000FF;">()=</span><span style="color: #004600;">JS</span><span style="color: #0000FF;">?</span><span style="color: #000000;">js_gets</span><span style="color: #0000FF;">(</span><span style="color: #000000;">input_file</span><span style="color: #0000FF;">)</span>
<span style="color: #0000FF;">:</span><span style="color: #7060A8;">gets</span><span style="color: #0000FF;">(</span><span style="color: #000000;">input_file</span><span style="color: #0000FF;">))</span>
<span style="color: #008080;">else</span>
Line 10,433 ⟶ 14,732:
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #000080;font-style:italic;">-- for pwa/p2js (JavaScript *really* dislikes tabs in strings):
--constant whitespace = " \t\r\n\x0B\xA0"</span>
<span style="color: #008080;">constant</span> <span style="color: #000000;">whitespace</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{</span><span style="color: #008000;">' '</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'\t'</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'\r'</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'\n'</span><span style="color: #0000FF;">,</span><span style="color: #000000;">#0B</span><span style="color: #0000FF;">,</span><span style="color: #000000;">#A0</span><span style="color: #0000FF;">}</span>
<span style="color: #000080;font-style:italic;">-- (0x0B is Vertical Tab, 0xA0 is Non-breaking space)</span>
Line 10,513 ⟶ 14,814:
<span style="color: #008080;">function</span> <span style="color: #000000;">get_op</span><span style="color: #0000FF;">()</span>
<span style="color: #000080;font-style:italic;">-- sequence operator = {ch}</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">operator</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">""</span><span style="color: #0000FF;">&</span><span style="color: #000000;">ch</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span>
Line 10,579 ⟶ 14,881:
<span style="color: #008080;">return</span> <span style="color: #000000;">toks</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<!--</langsyntaxhighlight>-->
Optional: if you need human-readable output/input at each (later) stage, so you can use pipes
<!--<langsyntaxhighlight Phixlang="phix">-->
<span style="color: #000080;font-style:italic;">--
-- demo\rosetta\Compiler\extra.e
Line 10,634 ⟶ 14,936:
<span style="color: #008080;">return</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">n_type</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">left</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">right</span><span style="color: #0000FF;">}</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<!--</langsyntaxhighlight>-->
Finally, a simple test driver for the specific task:
<!--<langsyntaxhighlight Phixlang="phix">(phixonline)-->
<span style="color: #000080;font-style:italic;">--
-- demo\rosetta\Compiler\lex.exw
Line 10,664 ⟶ 14,966:
<span style="color: #000080;font-style:italic;">--main(command_line())</span>
<span style="color: #000000;">main</span><span style="color: #0000FF;">({</span><span style="color: #000000;">0</span><span style="color: #0000FF;">,</span><span style="color: #000000;">0</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"test4.c"</span><span style="color: #0000FF;">})</span>
<!--</langsyntaxhighlight>-->
{{out}}
<pre>
Line 10,687 ⟶ 14,989:
=={{header|Prolog}}==
 
<langsyntaxhighlight lang="prolog">/*
Test harness for the analyzer, not needed if we are actually using the output.
*/
Line 10,847 ⟶ 15,149:
 
% anything else is an error
tok(_,_,L,P) --> { format(atom(Error), 'Invalid token at line ~d,~d', [L,P]), throw(Error) }.</langsyntaxhighlight>
{{out}}
<pre>
Line 10,888 ⟶ 15,190:
=={{header|Python}}==
Tested with Python 2.7 and 3.x
<langsyntaxhighlight Pythonlang="python">from __future__ import print_function
import sys
 
Line 10,969 ⟶ 15,271:
#*** "string"
def string_lit(start, err_line, err_col):
global the_ch
text = ""
 
Line 10,976 ⟶ 15,279:
if the_ch == '\n':
error(err_line, err_col, "EOL while scanning string literal")
if the_ch == '\\':
next_ch()
if the_ch != 'n':
error(err_line, err_col, "escape sequence unknown \\%c" % the_ch)
the_ch = '\n'
text += the_ch
 
Line 11,063 ⟶ 15,371:
 
if tok == tk_EOI:
break</langsyntaxhighlight>
 
{{out|case=test case 3}}
Line 11,107 ⟶ 15,415:
=={{header|QB64}}==
Tested with QB64 1.5
<langsyntaxhighlight lang="vb">dim shared source as string, the_ch as string, tok as string, toktyp as string
dim shared line_n as integer, col_n as integer, text_p as integer, err_line as integer, err_col as integer, errors as integer
 
Line 11,347 ⟶ 15,655:
end
end sub
</syntaxhighlight>
</lang>
{{out|case=test case 3}}
<b>
Line 11,387 ⟶ 15,695:
 
=={{header|Racket}}==
<langsyntaxhighlight lang="racket">
#lang racket
(require parser-tools/lex)
Line 11,543 ⟶ 15,851:
"TEST 5"
(display-tokens (string->tokens test5))
</syntaxhighlight>
</lang>
 
=={{header|Raku}}==
Line 11,553 ⟶ 15,861:
{{works with|Rakudo|2016.08}}
 
<syntaxhighlight lang="raku" perl6line>grammar tiny_C {
rule TOP { ^ <.whitespace>? <tokens> + % <.whitespace> <.whitespace> <eoi> }
 
Line 11,646 ⟶ 15,954:
 
my $tokenizer = tiny_C.parse(@*ARGS[0].IO.slurp);
parse_it( $tokenizer );</langsyntaxhighlight>
 
{{out|case=test case 3}}
Line 11,685 ⟶ 15,993:
23 1 End_of_input
</pre>
 
=={{header|RATFOR}}==
{{works with|ratfor77|[https://sourceforge.net/p/chemoelectric/ratfor77/ public domain 1.0]}}
{{works with|gfortran|11.2.1}}
{{works with|f2c|20100827}}
 
 
<syntaxhighlight lang="ratfor">######################################################################
#
# The Rosetta Code scanner in Ratfor 77.
#
#
# How to deal with FORTRAN 77 input is a problem. I use formatted
# input, treating each line as an array of type CHARACTER--regrettably
# of no more than some predetermined, finite length. It is a very
# simple method and presents no significant difficulties, aside from
# the restriction on line length of the input.
#
#
# On a POSIX platform, the program can be compiled with f2c and run
# somewhat as follows:
#
# ratfor77 lex-in-ratfor.r > lex-in-ratfor.f
# f2c -C -Nc40 lex-in-ratfor.f
# cc -O lex-in-ratfor.c -lf2c
# ./a.out < compiler-tests/primes.t
#
# With gfortran, a little differently:
#
# ratfor77 lex-in-ratfor.r > lex-in-ratfor.f
# gfortran -O2 -fcheck=all -std=legacy lex-in-ratfor.f
# ./a.out < compiler-tests/primes.t
#
#
# I/O is strictly from default input and to default output, which, on
# POSIX systems, usually correspond respectively to standard input and
# standard output. (I did not wish to have to deal with unit numbers;
# these are now standardized in ISO_FORTRAN_ENV, but that is not
# available in FORTRAN 77.)
#
#---------------------------------------------------------------------
 
# Some parameters you may with to modify.
 
define(LINESZ, 256) # Size of an input line.
define(OUTLSZ, 512) # Size of an output line.
define(PSHBSZ, 10) # Size of the character pushback buffer.
define(STRNSZ, 4096) # Size of the string pool.
 
#---------------------------------------------------------------------
 
define(EOF, -1)
define(NEWLIN, 10) # Unix newline (the LF control character).
define(BACKSL, 92) # ASCII backslash.
 
define(ILINNO, 1) # Line number's index.
define(ICOLNO, 2) # Column number's index.
 
define(CHRSZ, 3) # See ILINNO and ICOLNO above.
define(ICHRCD, 3) # Character code's index.
 
define(TOKSZ, 5) # See ILINNO and ICOLNO above.
define(ITOKNO, 3) # Token number's index.
define(IARGIX, 4) # Index of the string pool index.
define(IARGLN, 5) # Index of the string length.
 
define(TKELSE, 0)
define(TKIF, 1)
define(TKPRNT, 2)
define(TKPUTC, 3)
define(TKWHIL, 4)
define(TKMUL, 5)
define(TKDIV, 6)
define(TKMOD, 7)
define(TKADD, 8)
define(TKSUB, 9)
define(TKNEG, 10)
define(TKLT, 11)
define(TKLE, 12)
define(TKGT, 13)
define(TKGE, 14)
define(TKEQ, 15)
define(TKNE, 16)
define(TKNOT, 17)
define(TKASGN, 18)
define(TKAND, 19)
define(TKOR, 20)
define(TKLPAR, 21)
define(TKRPAR, 22)
define(TKLBRC, 23)
define(TKRBRC, 24)
define(TKSEMI, 25)
define(TKCMMA, 26)
define(TKID, 27)
define(TKINT, 28)
define(TKSTR, 29)
define(TKEOI, 30)
 
define(LOC10, 1) # Location of "10" in the string pool.
define(LOC92, 3) # Location of "92" in the string pool.
 
#---------------------------------------------------------------------
 
subroutine addstr (strngs, istrng, src, i0, n0, i, n)
 
# Add a string to the string pool.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character src(*) # Source string.
integer i0, n0 # Index and length in source string.
integer i, n # Index and length in string pool.
 
integer j
 
if (STRNSZ < istrng + (n0 - 1))
{
write (*, '(''string pool exhausted'')')
stop
}
for (j = 0; j < n0; j = j + 1)
strngs(istrng + j) = src(i0 + j)
i = istrng
n = n0
istrng = istrng + n0
end
 
subroutine cpystr (strngs, i, n, dst, i0)
 
# Copy a string from the string pool to an output string.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer i, n # Index and length in string pool.
character dst(OUTLSZ) # Destination string.
integer i0 # Index within destination string.
 
integer j
 
if (i0 < 1 || OUTLSZ < i0 + (n - 1))
{
write (*, '(''string boundary exceeded'')')
stop
}
for (j = 0; j < n; j = j + 1)
dst(i0 + j) = strngs(i + j)
end
 
#---------------------------------------------------------------------
 
subroutine getchr (line, linno, colno, pushbk, npshbk, chr)
 
# Get a character, with its line number and column number.
 
implicit none
 
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer chr(CHRSZ)
 
# End of file is indicated (as in C) by a negative "char code"
# called "EOF".
 
character*20 fmt
integer stat
integer chr1(CHRSZ)
 
if (0 < npshbk)
{
chr(ICHRCD) = pushbk(ICHRCD, npshbk)
chr(ILINNO) = pushbk(ILINNO, npshbk)
chr(ICOLNO) = pushbk(ICOLNO, npshbk)
npshbk = npshbk - 1
}
else if (colno <= LINESZ)
{
chr(ICHRCD) = ichar (line(colno))
chr(ILINNO) = linno
chr(ICOLNO) = colno
colno = colno + 1
}
else
{
# Return a newline character.
chr(ICHRCD) = NEWLIN
chr(ILINNO) = linno
chr(ICOLNO) = colno
 
# Fetch a new line.
linno = linno + 1
colno = 1
 
# Read a line of text as an array of characters.
write (fmt, '(''('', I10, ''A1)'')') LINESZ
read (*, fmt, iostat = stat) line
 
if (stat != 0)
{
# If end of file has been reached, push an EOF.
chr1(ICHRCD) = EOF
chr1(ILINNO) = linno
chr1(ICOLNO) = colno
call pshchr (pushbk, npshbk, chr1)
}
}
end
 
subroutine pshchr (pushbk, npshbk, chr)
 
# Push back a character.
 
implicit none
 
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer chr(CHRSZ)
 
if (PSHBSZ <= npshbk)
{
write (*, '(''pushback buffer overfull'')')
stop
}
npshbk = npshbk + 1
pushbk(ICHRCD, npshbk) = chr(ICHRCD)
pushbk(ILINNO, npshbk) = chr(ILINNO)
pushbk(ICOLNO, npshbk) = chr(ICOLNO)
end
 
subroutine getpos (line, linno, colno, pushbk, npshbk, ln, cn)
 
# Get the position of the next character.
 
implicit none
 
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer ln, cn # The line and column nos. returned.
 
integer chr(CHRSZ)
 
call getchr (line, linno, colno, pushbk, npshbk, chr)
ln = chr(ILINNO)
cn = chr(ICOLNO)
call pshchr (pushbk, npshbk, chr)
end
 
#---------------------------------------------------------------------
 
function isspc (c)
 
# Is c character code for a space?
 
implicit none
 
integer c
logical isspc
 
#
# The following is correct for ASCII: 32 is the SPACE character, and
# 9 to 13 are control characters commonly regarded as spaces.
#
# In Unicode these are all code points for spaces, but so are others
# besides.
#
isspc = (c == 32 || (9 <= c && c <= 13))
end
 
function isdgt (c)
 
# Is c character code for a digit?
 
implicit none
 
integer c
logical isdgt
 
isdgt = (ichar ('0') <= c && c <= ichar ('9'))
end
 
function isalph (c)
 
# Is c character code for a letter?
 
implicit none
 
integer c
logical isalph
 
#
# The following is correct for ASCII and Unicode, but not for
# EBCDIC.
#
isalph = (ichar ('a') <= c && c <= ichar ('z')) _
|| (ichar ('A') <= c && c <= ichar ('Z'))
end
 
function isid0 (c)
 
# Is c character code for the start of an identifier?
 
implicit none
 
integer c
logical isid0
 
logical isalph
 
isid0 = isalph (c) || c == ichar ('_')
end
 
function isid1 (c)
 
# Is c character code for the continuation of an identifier?
 
implicit none
 
integer c
logical isid1
 
logical isalph
logical isdgt
 
isid1 = isalph (c) || isdgt (c) || c == ichar ('_')
end
 
#---------------------------------------------------------------------
 
function trimlf (str, n)
 
# "Trim left" leading spaces.
 
implicit none
 
character str(*) # The string to "trim".
integer n # The length.
integer trimlf # The index of the first non-space
# character, or n + 1.
 
logical isspc
 
integer j
logical done
 
j = 1
done = .false.
while (!done)
{
if (j == n + 1)
done = .true.
else if (!isspc (ichar (str(j))))
done = .true.
else
j = j + 1
}
trimlf = j
end
 
function trimrt (str, n)
 
# "Trim right" trailing spaces.
 
implicit none
 
character str(*) # The string to "trim".
integer n # The length including trailing spaces.
integer trimrt # The length without trailing spaces.
 
logical isspc
 
integer j
logical done
 
j = n
done = .false.
while (!done)
{
if (j == 0)
done = .true.
else if (!isspc (ichar (str(j))))
done = .true.
else
j = j - 1
}
trimrt = j
end
 
#---------------------------------------------------------------------
 
subroutine toknam (tokno, str, i)
 
# Copy a token name to the character array str, starting at i.
 
implicit none
 
integer tokno
character str(*)
integer i
integer j
 
character*16 names(0:30)
character*16 nm
 
data names / "Keyword_else ", _
"Keyword_if ", _
"Keyword_print ", _
"Keyword_putc ", _
"Keyword_while ", _
"Op_multiply ", _
"Op_divide ", _
"Op_mod ", _
"Op_add ", _
"Op_subtract ", _
"Op_negate ", _
"Op_less ", _
"Op_lessequal ", _
"Op_greater ", _
"Op_greaterequal ", _
"Op_equal ", _
"Op_notequal ", _
"Op_not ", _
"Op_assign ", _
"Op_and ", _
"Op_or ", _
"LeftParen ", _
"RightParen ", _
"LeftBrace ", _
"RightBrace ", _
"Semicolon ", _
"Comma ", _
"Identifier ", _
"Integer ", _
"String ", _
"End_of_input " /
 
nm = names(tokno)
for (j = 0; j < 16; j = j + 1)
str(i + j) = nm(1 + j : 1 + j)
end
 
subroutine intstr (str, i, n, x)
 
# Convert a positive integer to a substring.
 
implicit none
 
character str(*) # Destination string.
integer i, n # Index and length of the field.
integer x # The positive integer to represent.
 
integer j
integer y
 
if (x == 0)
{
for (j = 0; j < n - 1; j = j + 1)
str(i + j) = ' '
str(i + j) = '0'
}
else
{
y = x
for (j = n - 1; 0 <= j; j = j - 1)
{
if (y == 0)
str(i + j) = ' '
else
{
str(i + j) = char (mod (y, 10) + ichar ('0'))
y = y / 10
}
}
}
end
 
subroutine prttok (strngs, tok)
 
# Print a token.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer tok(TOKSZ) # The token.
 
integer trimrt
 
character line(OUTLSZ)
character*20 fmt
integer i, n
integer tokno
 
for (i = 1; i <= OUTLSZ; i = i + 1)
line(i) = ' '
 
call intstr (line, 1, 10, tok(ILINNO))
call intstr (line, 12, 10, tok(ICOLNO))
 
tokno = tok(ITOKNO)
call toknam (tokno, line, 25)
if (tokno == TKID || tokno == TKINT || tokno == TKSTR)
{
i = tok(IARGIX)
n = tok(IARGLN)
call cpystr (strngs, i, n, line, 45)
}
 
n = trimrt (line, OUTLSZ)
write (fmt, '(''('', I10, ''A)'')') n
write (*, fmt) (line(i), i = 1, n)
end
 
#---------------------------------------------------------------------
 
subroutine wrtpos (ln, cn)
 
implicit none
 
integer ln, cn
 
write (*, 1000) ln, cn
1000 format ('At line ', I5, ', column ' I5)
end
 
#---------------------------------------------------------------------
 
subroutine utcmnt (ln, cn)
 
implicit none
 
integer ln, cn
 
call wrtpos (ln, cn)
write (*, '(''Unterminated comment'')')
stop
end
 
subroutine skpcmt (line, linno, colno, pushbk, npshbk, ln, cn)
 
# Skip to the end of a comment.
 
implicit none
 
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer ln, cn # Line and column of start of comment.
 
integer chr(CHRSZ)
logical done
 
done = .false.
while (!done)
{
call getchr (line, linno, colno, pushbk, npshbk, chr)
if (chr(ICHRCD) == ichar ('*'))
{
call getchr (line, linno, colno, pushbk, npshbk, chr)
if (chr(ICHRCD) == ichar ('/'))
done = .true.
else if (chr(ICHRCD) == EOF)
call utcmnt (ln, cn)
}
else if (chr(ICHRCD) == EOF)
call utcmnt (ln, cn)
}
end
 
subroutine skpspc (line, linno, colno, pushbk, npshbk)
 
# Skip spaces and comments.
 
implicit none
 
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
 
logical isspc
 
integer chr(CHRSZ)
integer chr1(CHRSZ)
integer ln, cn
logical done
 
done = .false.
while (!done)
{
call getchr (line, linno, colno, pushbk, npshbk, chr)
if (!isspc (chr(ICHRCD)))
{
if (chr(ICHRCD) != ichar ('/'))
{
call pshchr (pushbk, npshbk, chr)
done = .true.
}
else
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) != ichar ('*'))
{
call pshchr (pushbk, npshbk, chr1)
call pshchr (pushbk, npshbk, chr)
done = .true.
}
else
{
ln = chr(ILINNO)
cn = chr(ICOLNO)
call skpcmt (line, linno, colno, pushbk, npshbk, _
ln, cn)
}
}
}
}
end
 
#---------------------------------------------------------------------
 
subroutine rwdlkp (strngs, istrng, src, i0, n0, ln, cn, tok)
 
# Reserved word lookup
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character src(*) # The source string.
integer i0, n0 # Index and length of the substring.
integer ln, cn # Line and column number
# to associate with the token.
integer tok(TOKSZ) # The output token.
 
integer tokno
integer i, n
 
tokno = TKID
 
if (n0 == 2)
{
if (src(i0) == 'i' && src(i0 + 1) == 'f')
tokno = TKIF
}
else if (n0 == 4)
{
if (src(i0) == 'e' && src(i0 + 1) == 'l' _
&& src(i0 + 2) == 's' && src(i0 + 3) == 'e')
tokno = TKELSE
else if (src(i0) == 'p' && src(i0 + 1) == 'u' _
&& src(i0 + 2) == 't' && src(i0 + 3) == 'c')
tokno = TKPUTC
}
else if (n0 == 5)
{
if (src(i0) == 'p' && src(i0 + 1) == 'r' _
&& src(i0 + 2) == 'i' && src(i0 + 3) == 'n' _
&& src(i0 + 4) == 't')
tokno = TKPRNT
else if (src(i0) == 'w' && src(i0 + 1) == 'h' _
&& src(i0 + 2) == 'i' && src(i0 + 3) == 'l' _
&& src(i0 + 4) == 'e')
tokno = TKWHIL
}
 
i = 0
n = 0
if (tokno == TKID)
call addstr (strngs, istrng, src, i0, n0, i, n)
 
tok(ITOKNO) = tokno
tok(IARGIX) = i
tok(IARGLN) = n
tok(ILINNO) = ln
tok(ICOLNO) = cn
end
 
subroutine scnwrd (line, linno, colno, pushbk, npshbk, ln, cn, buf, n)
 
# Scan characters that may represent an identifier, reserved word,
# or integer literal.
 
implicit none
 
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer ln, cn # Line and column number of the start.
character buf(LINESZ) # The output buffer.
integer n # The length of the string collected.
 
logical isid1
 
integer chr(CHRSZ)
 
n = 0
call getchr (line, linno, colno, pushbk, npshbk, chr)
ln = chr(ILINNO)
cn = chr(ICOLNO)
while (isid1 (chr(ICHRCD)))
{
n = n + 1
buf(n) = char (chr(ICHRCD))
call getchr (line, linno, colno, pushbk, npshbk, chr)
}
call pshchr (pushbk, npshbk, chr)
end
 
subroutine scnidr (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
 
# Scan an identifier or reserved word.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer ln, cn # Line and column number of the start.
integer tok(TOKSZ) # The output token.
 
character buf(LINESZ)
integer n
call scnwrd (line, linno, colno, pushbk, npshbk, ln, cn, buf, n)
call rwdlkp (strngs, istrng, buf, 1, n, ln, cn, tok)
end
 
subroutine scnint (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
 
# Scan a positive integer literal.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer ln, cn # Line and column number of the start.
integer tok(TOKSZ) # The output token.
 
logical isdgt
 
character buf(LINESZ)
integer n0, n
integer i, j, k
character*80 fmt
call scnwrd (line, linno, colno, pushbk, npshbk, ln, cn, buf, n0)
for (j = 1; j <= n0; j = j + 1)
if (!isdgt (ichar (buf(j))))
{
call wrtpos (ln, cn)
write (fmt, 1000) n0
1000 format ('(''Not a legal word: "''', I10, 'A, ''"'')')
write (*, fmt) (buf(k), k = 1, n0)
stop
}
 
call addstr (strngs, istrng, buf, 1, n0, i, n)
 
tok(ITOKNO) = TKINT
tok(IARGIX) = i
tok(IARGLN) = n
tok(ILINNO) = ln
tok(ICOLNO) = cn
end
 
subroutine utclit (ln, cn)
 
implicit none
 
integer ln, cn
 
call wrtpos (ln, cn)
write (*, '(''Unterminated character literal'')')
stop
end
 
subroutine scnch1 (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
 
# Scan a character literal, without yet checking that the literal
# ends correctly.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer tok(TOKSZ) # The output token.
 
integer trimlf
 
integer bufsz
parameter (bufsz = 40)
 
integer chr(CHRSZ)
integer chr1(CHRSZ)
integer chr2(CHRSZ)
integer ln, cn
character buf(bufsz)
integer i, j, n
 
# Refetch the opening quote.
call getchr (line, linno, colno, pushbk, npshbk, chr)
ln = chr(ILINNO)
cn = chr(ICOLNO)
 
tok(ITOKNO) = TKINT
tok(ILINNO) = ln
tok(ICOLNO) = cn
 
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == EOF)
call utclit (ln, cn)
if (chr1(ICHRCD) == BACKSL)
{
call getchr (line, linno, colno, pushbk, npshbk, chr2)
if (chr2(ICHRCD) == EOF)
call utclit (ln, cn)
else if (chr2(ICHRCD) == ichar ('n'))
{
tok(IARGIX) = LOC10 # "10" = code for Unix newline
tok(IARGLN) = 2
}
else if (chr2(ICHRCD) == BACKSL)
{
tok(IARGIX) = LOC92 # "92" = code for backslash
tok(IARGLN) = 2
}
else
{
call wrtpos (ln, cn)
write (*, '(''Unsupported escape: '', 1A)') _
char (chr2(ICHRCD))
stop
}
}
else
{
# Character codes are non-negative, so we can use intstr.
call intstr (buf, 1, bufsz, chr1(ICHRCD))
 
j = trimlf (buf, bufsz)
call addstr (strngs, istrng, buf, j, bufsz - (j - 1), i, n)
tok(IARGIX) = i
tok(IARGLN) = n
}
end
 
subroutine scnch (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
 
# Scan a character literal.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer tok(TOKSZ) # The output token.
 
integer ln, cn
integer chr(CHRSZ)
 
call getpos (line, linno, colno, pushbk, npshbk, ln, cn)
call scnch1 (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
call getchr (line, linno, colno, pushbk, npshbk, chr)
if (chr(ICHRCD) != ichar (''''))
{
while (.true.)
{
if (chr(ICHRCD) == EOF)
{
call utclit (ln, cn)
stop
}
else if (chr(ICHRCD) == ichar (''''))
{
call wrtpos (ln, cn)
write (*, '(''Unsupported multicharacter literal'')')
stop
}
call getchr (line, linno, colno, pushbk, npshbk, chr)
}
}
end
 
subroutine scnstr (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
 
# Scan a string literal.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer tok(TOKSZ) # The output token.
 
integer ln, cn
integer chr1(CHRSZ)
integer chr2(CHRSZ)
character buf(LINESZ + 10) # Enough space, with some room to spare.
integer n0
integer i, n
 
call getchr (line, linno, colno, pushbk, npshbk, chr1)
ln = chr1(ILINNO)
cn = chr1(ICOLNO)
 
tok(ITOKNO) = TKSTR
tok(ILINNO) = ln
tok(ICOLNO) = cn
 
n0 = 1
buf(n0) = '"'
 
call getchr (line, linno, colno, pushbk, npshbk, chr1)
while (chr1(ICHRCD) != ichar ('"'))
{
# Our input method always puts a NEWLIN before EOF, and so this
# test is redundant, unless someone changes the input method.
if (chr1(ICHRCD) == EOF || chr1(ICHRCD) == NEWLIN)
{
call wrtpos (ln, cn)
write (*, '(''Unterminated string literal'')')
stop
}
if (chr1(ICHRCD) == BACKSL)
{
call getchr (line, linno, colno, pushbk, npshbk, chr2)
if (chr2(ICHRCD) == ichar ('n'))
{
n0 = n0 + 1
buf(n0) = char (BACKSL)
n0 = n0 + 1
buf(n0) = 'n'
}
else if (chr2(ICHRCD) == BACKSL)
{
n0 = n0 + 1
buf(n0) = char (BACKSL)
n0 = n0 + 1
buf(n0) = char (BACKSL)
}
else
{
call wrtpos (chr1(ILINNO), chr1(ICOLNO))
write (*, '(''Unsupported escape sequence'')')
stop
}
}
else
{
n0 = n0 + 1
buf(n0) = char (chr1(ICHRCD))
}
call getchr (line, linno, colno, pushbk, npshbk, chr1)
}
n0 = n0 + 1
buf(n0) = '"'
 
call addstr (strngs, istrng, buf, 1, n0, i, n)
tok(IARGIX) = i
tok(IARGLN) = n
end
 
subroutine unxchr (chr)
 
implicit none
 
integer chr(CHRSZ)
 
call wrtpos (chr(ILINNO), chr(ICOLNO))
write (*, 1000) char (chr(ICHRCD))
1000 format ('Unexpected character ''', A1, '''')
stop
end
 
subroutine scntok (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
 
# Scan a token.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
integer tok(TOKSZ) # The output token.
 
logical isdgt
logical isid0
 
integer chr(CHRSZ)
integer chr1(CHRSZ)
integer ln, cn
 
call getchr (line, linno, colno, pushbk, npshbk, chr)
ln = chr(ILINNO)
cn = chr(ICOLNO)
tok(ILINNO) = ln
tok(ICOLNO) = cn
tok(IARGIX) = 0
tok(IARGLN) = 0
if (chr(ICHRCD) == ichar (','))
tok(ITOKNO) = TKCMMA
else if (chr(ICHRCD) == ichar (';'))
tok(ITOKNO) = TKSEMI
else if (chr(ICHRCD) == ichar ('('))
tok(ITOKNO) = TKLPAR
else if (chr(ICHRCD) == ichar (')'))
tok(ITOKNO) = TKRPAR
else if (chr(ICHRCD) == ichar ('{'))
tok(ITOKNO) = TKLBRC
else if (chr(ICHRCD) == ichar ('}'))
tok(ITOKNO) = TKRBRC
else if (chr(ICHRCD) == ichar ('*'))
tok(ITOKNO) = TKMUL
else if (chr(ICHRCD) == ichar ('/'))
tok(ITOKNO) = TKDIV
else if (chr(ICHRCD) == ichar ('%'))
tok(ITOKNO) = TKMOD
else if (chr(ICHRCD) == ichar ('+'))
tok(ITOKNO) = TKADD
else if (chr(ICHRCD) == ichar ('-'))
tok(ITOKNO) = TKSUB
else if (chr(ICHRCD) == ichar ('<'))
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == ichar ('='))
tok(ITOKNO) = TKLE
else
{
call pshchr (pushbk, npshbk, chr1)
tok(ITOKNO) = TKLT
}
}
else if (chr(ICHRCD) == ichar ('>'))
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == ichar ('='))
tok(ITOKNO) = TKGE
else
{
call pshchr (pushbk, npshbk, chr1)
tok(ITOKNO) = TKGT
}
}
else if (chr(ICHRCD) == ichar ('='))
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == ichar ('='))
tok(ITOKNO) = TKEQ
else
{
call pshchr (pushbk, npshbk, chr1)
tok(ITOKNO) = TKASGN
}
}
else if (chr(ICHRCD) == ichar ('!'))
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == ichar ('='))
tok(ITOKNO) = TKNE
else
{
call pshchr (pushbk, npshbk, chr1)
tok(ITOKNO) = TKNOT
}
}
else if (chr(ICHRCD) == ichar ('&'))
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == ichar ('&'))
tok(ITOKNO) = TKAND
else
call unxchr (chr)
}
else if (chr(ICHRCD) == ichar ('|'))
{
call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == ichar ('|'))
tok(ITOKNO) = TKOR
else
call unxchr (chr)
}
else if (chr(ICHRCD) == ichar ('"'))
{
call pshchr (pushbk, npshbk, chr)
call scnstr (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
}
else if (chr(ICHRCD) == ichar (''''))
{
call pshchr (pushbk, npshbk, chr)
call scnch (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
}
else if (isdgt (chr(ICHRCD)))
{
call pshchr (pushbk, npshbk, chr)
call scnint (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
}
else if (isid0 (chr(ICHRCD)))
{
call pshchr (pushbk, npshbk, chr)
call scnidr (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
}
else
call unxchr (chr)
end
 
subroutine scntxt (strngs, istrng, _
line, linno, colno, pushbk, npshbk)
 
# Scan the text and print the token stream.
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
 
integer chr(CHRSZ)
integer tok(TOKSZ)
 
chr(ICHRCD) = ichar ('x')
while (chr(ICHRCD) != EOF)
{
call skpspc (line, linno, colno, pushbk, npshbk)
call getchr (line, linno, colno, pushbk, npshbk, chr)
if (chr(ICHRCD) != EOF)
{
call pshchr (pushbk, npshbk, chr)
call scntok (strngs, istrng, _
line, linno, colno, pushbk, npshbk, _
tok)
call prttok (strngs, tok)
}
}
tok(ITOKNO) = TKEOI
tok(ILINNO) = chr(ILINNO)
tok(ICOLNO) = chr(ICOLNO)
tok(IARGIX) = 0
tok(IARGLN) = 0
call prttok (strngs, tok)
end
 
#---------------------------------------------------------------------
 
program lex
 
implicit none
 
character strngs(STRNSZ) # String pool.
integer istrng # String pool's next slot.
character line(LINESZ) # Input buffer.
integer linno, colno # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer.
integer npshbk # Number of characters pushed back.
 
integer i, n
 
istrng = 1
 
# Locate "10" (newline) at 1 in the string pool.
line(1) = '1'
line(2) = '0'
call addstr (strngs, istrng, line, 1, 2, i, n)
if (i != 1 && n != 2)
{
write (*, '(''internal error'')')
stop
}
 
# Locate "92" (backslash) at 3 in the string pool.
line(1) = '9'
line(2) = '2'
call addstr (strngs, istrng, line, 1, 2, i, n)
if (i != 3 && n != 2)
{
write (*, '(''internal error'')')
stop
}
 
linno = 0
colno = LINESZ + 1 # This will trigger a READ.
npshbk = 0
 
call scntxt (strngs, istrng, line, linno, colno, pushbk, npshbk)
end
 
######################################################################</syntaxhighlight>
 
 
{{out}}
<pre>$ ratfor77 lex-in-ratfor.r > lex-in-ratfor.f && gfortran -O2 -std=legacy -fcheck=all lex-in-ratfor.f && ./a.out < compiler-tests/primes.t
4 1 Identifier count
4 7 Op_assign
4 9 Integer 1
4 10 Semicolon
5 1 Identifier n
5 3 Op_assign
5 5 Integer 1
5 6 Semicolon
6 1 Identifier limit
6 7 Op_assign
6 9 Integer 100
6 12 Semicolon
7 1 Keyword_while
7 7 LeftParen
7 8 Identifier n
7 10 Op_less
7 12 Identifier limit
7 17 RightParen
7 19 LeftBrace
8 5 Identifier k
8 6 Op_assign
8 7 Integer 3
8 8 Semicolon
9 5 Identifier p
9 6 Op_assign
9 7 Integer 1
9 8 Semicolon
10 5 Identifier n
10 6 Op_assign
10 7 Identifier n
10 8 Op_add
10 9 Integer 2
10 10 Semicolon
11 5 Keyword_while
11 11 LeftParen
11 12 LeftParen
11 13 Identifier k
11 14 Op_multiply
11 15 Identifier k
11 16 Op_lessequal
11 18 Identifier n
11 19 RightParen
11 21 Op_and
11 24 LeftParen
11 25 Identifier p
11 26 RightParen
11 27 RightParen
11 29 LeftBrace
12 9 Identifier p
12 10 Op_assign
12 11 Identifier n
12 12 Op_divide
12 13 Identifier k
12 14 Op_multiply
12 15 Identifier k
12 16 Op_notequal
12 18 Identifier n
12 19 Semicolon
13 9 Identifier k
13 10 Op_assign
13 11 Identifier k
13 12 Op_add
13 13 Integer 2
13 14 Semicolon
14 5 RightBrace
15 5 Keyword_if
15 8 LeftParen
15 9 Identifier p
15 10 RightParen
15 12 LeftBrace
16 9 Keyword_print
16 14 LeftParen
16 15 Identifier n
16 16 Comma
16 18 String " is prime\n"
16 31 RightParen
16 32 Semicolon
17 9 Identifier count
17 15 Op_assign
17 17 Identifier count
17 23 Op_add
17 25 Integer 1
17 26 Semicolon
18 5 RightBrace
19 1 RightBrace
20 1 Keyword_print
20 6 LeftParen
20 7 String "Total primes found: "
20 29 Comma
20 31 Identifier count
20 36 Comma
20 38 String "\n"
20 42 RightParen
20 43 Semicolon
21 1 End_of_input</pre>
 
=={{header|Scala}}==
Line 11,691 ⟶ 17,336:
The following code implements a configurable (from a symbol map and keyword map provided as parameters) lexical analyzer.
 
<langsyntaxhighlight lang="scala">
package xyz.hyperreal.rosettacodeCompiler
 
Line 11,952 ⟶ 17,597:
 
}
</syntaxhighlight>
</lang>
 
=={{header|Scheme}}==
 
<langsyntaxhighlight lang="scheme">
(import (scheme base)
(scheme char)
Line 12,153 ⟶ 17,798:
(display-tokens (lexer (cadr (command-line))))
(display "Error: provide program filename\n"))
</syntaxhighlight>
</lang>
 
{{out}}
Line 12,165 ⟶ 17,810:
5 1 End_of_input
</pre>
 
=={{header|Standard ML}}==
{{trans|ATS}}
{{trans|OCaml}}
 
 
<syntaxhighlight lang="sml">(*------------------------------------------------------------------*)
(* The Rosetta Code lexical analyzer, in Standard ML. Based on the ATS
and the OCaml. The intended compiler is Mlton or Poly/ML; there is
a tiny difference near the end of the file, depending on which
compiler is used. *)
 
(*------------------------------------------------------------------*)
(* The following functions are compatible with ASCII. *)
 
fun
is_digit ichar =
48 <= ichar andalso ichar <= 57
 
fun
is_lower ichar =
97 <= ichar andalso ichar <= 122
 
fun
is_upper ichar =
65 <= ichar andalso ichar <= 90
 
fun
is_alpha ichar =
is_lower ichar orelse is_upper ichar
 
fun
is_alnum ichar =
is_digit ichar orelse is_alpha ichar
 
fun
is_ident_start ichar =
is_alpha ichar orelse ichar = 95
 
fun
is_ident_continuation ichar =
is_alnum ichar orelse ichar = 95
 
fun
is_space ichar =
ichar = 32 orelse (9 <= ichar andalso ichar <= 13)
 
(*------------------------------------------------------------------*)
(* Character input more like that of C. There are various advantages
and disadvantages to this method, but key points in its favor are:
(a) it is how character input is done in the original ATS code, (b)
Unicode code points are 21-bit positive integers. *)
 
val eof = ~1
 
fun
input_ichar inpf =
case TextIO.input1 inpf of
NONE => eof
| SOME c => Char.ord c
 
(*------------------------------------------------------------------*)
 
(* The type of an input character. *)
 
structure Ch =
struct
 
type t = {
ichar : int,
line_no : int,
column_no : int
}
 
end
 
(*------------------------------------------------------------------*)
(* Inputting with unlimited pushback, and with counting of lines and
columns. *)
 
structure Inp =
struct
 
type t = {
inpf : TextIO.instream,
pushback : Ch.t list,
line_no : int,
column_no : int
}
 
fun
of_instream inpf =
{
inpf = inpf,
pushback = [],
line_no = 1,
column_no = 1
} : t
 
fun
get_ch ({ inpf = inpf,
pushback = pushback,
line_no = line_no,
column_no = column_no } : t) =
case pushback of
ch :: tail =>
let
val inp = { inpf = inpf,
pushback = tail,
line_no = line_no,
column_no = column_no }
in
(ch, inp)
end
| [] =>
let
val ichar = input_ichar inpf
val ch = { ichar = ichar,
line_no = line_no,
column_no = column_no }
in
if ichar = Char.ord #"\n" then
let
val inp = { inpf = inpf,
pushback = [],
line_no = line_no + 1,
column_no = 1 }
in
(ch, inp)
end
else
let
val inp = { inpf = inpf,
pushback = [],
line_no = line_no,
column_no = column_no + 1 }
in
(ch, inp)
end
end
 
fun
push_back_ch (ch, inp : t) =
{
inpf = #inpf inp,
pushback = ch :: #pushback inp,
line_no = #line_no inp,
column_no = #column_no inp
}
 
end
 
(*------------------------------------------------------------------*)
(* Tokens, appearing in tuples with arguments, and with line and
column numbers. The tokens are integers, so they can be used as
array indices. *)
 
val token_ELSE = 0
val token_IF = 1
val token_PRINT = 2
val token_PUTC = 3
val token_WHILE = 4
val token_MULTIPLY = 5
val token_DIVIDE = 6
val token_MOD = 7
val token_ADD = 8
val token_SUBTRACT = 9
val token_NEGATE = 10
val token_LESS = 11
val token_LESSEQUAL = 12
val token_GREATER = 13
val token_GREATEREQUAL = 14
val token_EQUAL = 15
val token_NOTEQUAL = 16
val token_NOT = 17
val token_ASSIGN = 18
val token_AND = 19
val token_OR = 20
val token_LEFTPAREN = 21
val token_RIGHTPAREN = 22
val token_LEFTBRACE = 23
val token_RIGHTBRACE = 24
val token_SEMICOLON = 25
val token_COMMA = 26
val token_IDENTIFIER = 27
val token_INTEGER = 28
val token_STRING = 29
val token_END_OF_INPUT = 30
 
(* A *very* simple perfect hash for the reserved words. (Yes, this is
overkill, except for demonstration of the principle.) *)
 
val reserved_words =
Vector.fromList ["if", "print", "else",
"", "putc", "",
"", "while", ""]
val reserved_word_tokens =
Vector.fromList [token_IF, token_PRINT, token_ELSE,
token_IDENTIFIER, token_PUTC, token_IDENTIFIER,
token_IDENTIFIER, token_WHILE, token_IDENTIFIER]
 
fun
reserved_word_lookup (s, line_no, column_no) =
if (String.size s) < 2 then
(token_IDENTIFIER, s, line_no, column_no)
else
let
val hashval =
(Char.ord (String.sub (s, 0)) +
Char.ord (String.sub (s, 1)))
mod 9
val token = Vector.sub (reserved_word_tokens, hashval)
in
if token = token_IDENTIFIER orelse
s <> Vector.sub (reserved_words, hashval) then
(token_IDENTIFIER, s, line_no, column_no)
else
(token, s, line_no, column_no)
end
 
(* Token to string lookup. *)
 
val token_names =
Vector.fromList
["Keyword_else",
"Keyword_if",
"Keyword_print",
"Keyword_putc",
"Keyword_while",
"Op_multiply",
"Op_divide",
"Op_mod",
"Op_add",
"Op_subtract",
"Op_negate",
"Op_less",
"Op_lessequal",
"Op_greater",
"Op_greaterequal",
"Op_equal",
"Op_notequal",
"Op_not",
"Op_assign",
"Op_and",
"Op_or",
"LeftParen",
"RightParen",
"LeftBrace",
"RightBrace",
"Semicolon",
"Comma",
"Identifier",
"Integer",
"String",
"End_of_input"]
 
fun
token_name token =
Vector.sub (token_names, token)
 
(*------------------------------------------------------------------*)
 
exception Unterminated_comment of int * int
exception Unterminated_character_literal of int * int
exception Multicharacter_literal of int * int
exception End_of_input_in_string_literal of int * int
exception End_of_line_in_string_literal of int * int
exception Unsupported_escape of int * int * char
exception Invalid_integer_literal of int * int * string
exception Unexpected_character of int * int * char
 
(*------------------------------------------------------------------*)
(* Skipping past spaces and comments. (In the Rosetta Code tiny
language, a comment, if you think about it, is a kind of space.) *)
 
fun
scan_comment (inp, line_no, column_no) =
let
fun
loop inp =
let
val (ch, inp) = Inp.get_ch inp
in
if #ichar ch = eof then
raise Unterminated_comment (line_no, column_no)
else if #ichar ch = Char.ord #"*" then
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = eof then
raise Unterminated_comment (line_no, column_no)
else if #ichar ch1 = Char.ord #"/" then
inp
else
loop inp
end
else
loop inp
end
in
loop inp
end
 
fun
skip_spaces_and_comments inp =
let
fun
loop inp =
let
val (ch, inp) = Inp.get_ch inp
in
if is_space (#ichar ch) then
loop inp
else if #ichar ch = Char.ord #"/" then
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"*" then
loop (scan_comment (inp, #line_no ch, #column_no ch))
else
let
val inp = Inp.push_back_ch (ch1, inp)
val inp = Inp.push_back_ch (ch, inp)
in
inp
end
end
else
Inp.push_back_ch (ch, inp)
end
in
loop inp
end
 
(*------------------------------------------------------------------*)
(* Integer literals, identifiers, and reserved words. *)
 
fun
scan_word (lst, inp) =
let
val (ch, inp) = Inp.get_ch inp
in
if is_ident_continuation (#ichar ch) then
scan_word (Char.chr (#ichar ch) :: lst, inp)
else
(lst, Inp.push_back_ch (ch, inp))
end
 
fun
scan_integer_literal inp =
let
val (ch, inp) = Inp.get_ch inp
val (lst, inp) = scan_word ([Char.chr (#ichar ch)], inp)
val s = String.implode (List.rev lst)
in
if List.all (fn c => is_digit (Char.ord c)) lst then
((token_INTEGER, s, #line_no ch, #column_no ch), inp)
else
raise Invalid_integer_literal (#line_no ch, #column_no ch, s)
end
 
fun
scan_identifier_or_reserved_word inp =
let
val (ch, inp) = Inp.get_ch inp
val (lst, inp) = scan_word ([Char.chr (#ichar ch)], inp)
val s = String.implode (List.rev lst)
val toktup = reserved_word_lookup (s, #line_no ch, #column_no ch)
in
(toktup, inp)
end
 
(*------------------------------------------------------------------*)
(* String literals. *)
 
fun
scan_string_literal inp =
let
val (ch, inp) = Inp.get_ch inp
 
fun
scan (lst, inp) =
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = eof then
raise End_of_input_in_string_literal
(#line_no ch, #column_no ch)
else if #ichar ch1 = Char.ord #"\n" then
raise End_of_line_in_string_literal
(#line_no ch, #column_no ch)
else if #ichar ch1 = Char.ord #"\"" then
(lst, inp)
else if #ichar ch1 <> Char.ord #"\\" then
scan (Char.chr (#ichar ch1) :: lst, inp)
else
let
val (ch2, inp) = Inp.get_ch inp
in
if #ichar ch2 = Char.ord #"n" then
scan (#"n" :: #"\\" :: lst, inp)
else if #ichar ch2 = Char.ord #"\\" then
scan (#"\\" :: #"\\" :: lst, inp)
else if #ichar ch2 = eof then
raise End_of_input_in_string_literal
(#line_no ch, #column_no ch)
else if #ichar ch2 = Char.ord #"\n" then
raise End_of_line_in_string_literal
(#line_no ch, #column_no ch)
else
raise Unsupported_escape (#line_no ch1, #column_no ch1,
Char.chr (#ichar ch2))
end
end
 
val lst = #"\"" :: []
val (lst, inp) = scan (lst, inp)
val lst = #"\"" :: lst
val s = String.implode (List.rev lst)
in
((token_STRING, s, #line_no ch, #column_no ch), inp)
end
 
(*------------------------------------------------------------------*)
(* Character literals. *)
 
fun
scan_character_literal_without_checking_end inp =
let
val (ch, inp) = Inp.get_ch inp
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = eof then
raise Unterminated_character_literal
(#line_no ch, #column_no ch)
else if #ichar ch1 = Char.ord #"\\" then
let
val (ch2, inp) = Inp.get_ch inp
in
if #ichar ch2 = eof then
raise Unterminated_character_literal
(#line_no ch, #column_no ch)
else if #ichar ch2 = Char.ord #"n" then
let
val s = Int.toString (Char.ord #"\n")
in
((token_INTEGER, s, #line_no ch, #column_no ch), inp)
end
else if #ichar ch2 = Char.ord #"\\" then
let
val s = Int.toString (Char.ord #"\\")
in
((token_INTEGER, s, #line_no ch, #column_no ch), inp)
end
else
raise Unsupported_escape (#line_no ch1, #column_no ch1,
Char.chr (#ichar ch2))
end
else
let
val s = Int.toString (#ichar ch1)
in
((token_INTEGER, s, #line_no ch, #column_no ch), inp)
end
end
 
fun
scan_character_literal inp =
let
val (toktup, inp) =
scan_character_literal_without_checking_end inp
val (_, _, line_no, column_no) = toktup
 
fun
check_end inp =
let
val (ch, inp) = Inp.get_ch inp
in
if #ichar ch = Char.ord #"'" then
inp
else
let
fun
loop_to_end (ch1 : Ch.t, inp) =
if #ichar ch1 = eof then
raise Unterminated_character_literal (line_no, column_no)
else if #ichar ch1 = Char.ord #"'" then
raise Multicharacter_literal (line_no, column_no)
else
let
val (ch1, inp) = Inp.get_ch inp
in
loop_to_end (ch1, inp)
end
in
loop_to_end (ch, inp)
end
end
 
val inp = check_end inp
in
(toktup, inp)
end
 
(*------------------------------------------------------------------*)
 
fun
get_next_token inp =
let
val inp = skip_spaces_and_comments inp
val (ch, inp) = Inp.get_ch inp
val ln = #line_no ch
val cn = #column_no ch
in
if #ichar ch = eof then
((token_END_OF_INPUT, "", ln, cn), inp)
else
case Char.chr (#ichar ch) of
#"," => ((token_COMMA, ",", ln, cn), inp)
| #";" => ((token_SEMICOLON, ";", ln, cn), inp)
| #"(" => ((token_LEFTPAREN, "(", ln, cn), inp)
| #")" => ((token_RIGHTPAREN, ")", ln, cn), inp)
| #"{" => ((token_LEFTBRACE, "{", ln, cn), inp)
| #"}" => ((token_RIGHTBRACE, "}", ln, cn), inp)
| #"*" => ((token_MULTIPLY, "*", ln, cn), inp)
| #"/" => ((token_DIVIDE, "/", ln, cn), inp)
| #"%" => ((token_MOD, "%", ln, cn), inp)
| #"+" => ((token_ADD, "+", ln, cn), inp)
| #"-" => ((token_SUBTRACT, "-", ln, cn), inp)
| #"<" =>
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"=" then
((token_LESSEQUAL, "<=", ln, cn), inp)
else
let
val inp = Inp.push_back_ch (ch1, inp)
in
((token_LESS, "<", ln, cn), inp)
end
end
| #">" =>
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"=" then
((token_GREATEREQUAL, ">=", ln, cn), inp)
else
let
val inp = Inp.push_back_ch (ch1, inp)
in
((token_GREATER, ">", ln, cn), inp)
end
end
| #"=" =>
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"=" then
((token_EQUAL, "==", ln, cn), inp)
else
let
val inp = Inp.push_back_ch (ch1, inp)
in
((token_ASSIGN, "=", ln, cn), inp)
end
end
| #"!" =>
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"=" then
((token_NOTEQUAL, "!=", ln, cn), inp)
else
let
val inp = Inp.push_back_ch (ch1, inp)
in
((token_NOT, "!", ln, cn), inp)
end
end
| #"&" =>
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"&" then
((token_AND, "&&", ln, cn), inp)
else
raise Unexpected_character (#line_no ch, #column_no ch,
Char.chr (#ichar ch))
end
| #"|" =>
let
val (ch1, inp) = Inp.get_ch inp
in
if #ichar ch1 = Char.ord #"|" then
((token_OR, "||", ln, cn), inp)
else
raise Unexpected_character (#line_no ch, #column_no ch,
Char.chr (#ichar ch))
end
| #"\"" =>
let
val inp = Inp.push_back_ch (ch, inp)
in
scan_string_literal inp
end
| #"'" =>
let
val inp = Inp.push_back_ch (ch, inp)
in
scan_character_literal inp
end
| _ =>
if is_digit (#ichar ch) then
let
val inp = Inp.push_back_ch (ch, inp)
in
scan_integer_literal inp
end
else if is_ident_start (#ichar ch) then
let
val inp = Inp.push_back_ch (ch, inp)
in
scan_identifier_or_reserved_word inp
end
else
raise Unexpected_character (#line_no ch, #column_no ch,
Char.chr (#ichar ch))
end
 
fun
output_integer_rightjust (outf, num) =
(if num < 10 then
TextIO.output (outf, " ")
else if num < 100 then
TextIO.output (outf, " ")
else if num < 1000 then
TextIO.output (outf, " ")
else if num < 10000 then
TextIO.output (outf, " ")
else
();
TextIO.output (outf, Int.toString num))
 
fun
print_token (outf, toktup) =
let
val (token, arg, line_no, column_no) = toktup
val name = token_name token
val (padding, str) =
if token = token_IDENTIFIER then
(" ", arg)
else if token = token_INTEGER then
(" ", arg)
else if token = token_STRING then
(" ", arg)
else("", "")
in
output_integer_rightjust (outf, line_no);
TextIO.output (outf, " ");
output_integer_rightjust (outf, column_no);
TextIO.output (outf, " ");
TextIO.output (outf, name);
TextIO.output (outf, padding);
TextIO.output (outf, str);
TextIO.output (outf, "\n")
end
 
fun
scan_text (outf, inp) =
let
fun
loop inp =
let
val (toktup, inp) = get_next_token inp
in
(print_token (outf, toktup);
let
val (token, _, _, _) = toktup
in
if token <> token_END_OF_INPUT then
loop inp
else
()
end)
end
in
loop inp
end
 
(*------------------------------------------------------------------*)
 
fun
main () =
let
val args = CommandLine.arguments ()
val (inpf_filename, outf_filename) =
case args of
[] => ("-", "-")
| name :: [] => (name, "-")
| name1 :: name2 :: _ => (name1, name2)
val inpf =
if inpf_filename = "-" then
TextIO.stdIn
else
TextIO.openIn inpf_filename
handle
(IO.Io _) =>
(TextIO.output (TextIO.stdErr, "Failure opening \"");
TextIO.output (TextIO.stdErr, inpf_filename);
TextIO.output (TextIO.stdErr, "\" for input\n");
OS.Process.exit OS.Process.failure)
val outf =
if outf_filename = "-" then
TextIO.stdOut
else
TextIO.openOut outf_filename
handle
(IO.Io _) =>
(TextIO.output (TextIO.stdErr, "Failure opening \"");
TextIO.output (TextIO.stdErr, outf_filename);
TextIO.output (TextIO.stdErr, "\" for output\n");
OS.Process.exit OS.Process.failure)
val inp = Inp.of_instream inpf
in
scan_text (outf, inp)
end
handle Unterminated_comment (line_no, column_no) =>
(TextIO.output (TextIO.stdErr, ": unterminated comment ");
TextIO.output (TextIO.stdErr, "starting at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| Unterminated_character_literal (line_no, column_no) =>
(TextIO.output (TextIO.stdErr, ": unterminated character ");
TextIO.output (TextIO.stdErr, "literal starting at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| Multicharacter_literal (line_no, column_no) =>
(TextIO.output (TextIO.stdErr, ": unsupported multicharacter");
TextIO.output (TextIO.stdErr, " literal starting at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| End_of_input_in_string_literal (line_no, column_no) =>
(TextIO.output (TextIO.stdErr, ": end of input in string");
TextIO.output (TextIO.stdErr, " literal starting at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| End_of_line_in_string_literal (line_no, column_no) =>
(TextIO.output (TextIO.stdErr, ": end of line in string");
TextIO.output (TextIO.stdErr, " literal starting at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| Unsupported_escape (line_no, column_no, c) =>
(TextIO.output (TextIO.stdErr, CommandLine.name ());
TextIO.output (TextIO.stdErr, ": unsupported escape \\");
TextIO.output (TextIO.stdErr, Char.toString c);
TextIO.output (TextIO.stdErr, " at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| Invalid_integer_literal (line_no, column_no, str) =>
(TextIO.output (TextIO.stdErr, CommandLine.name ());
TextIO.output (TextIO.stdErr, ": invalid integer literal ");
TextIO.output (TextIO.stdErr, str);
TextIO.output (TextIO.stdErr, " at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure)
| Unexpected_character (line_no, column_no, c) =>
(TextIO.output (TextIO.stdErr, CommandLine.name ());
TextIO.output (TextIO.stdErr, ": unexpected character '");
TextIO.output (TextIO.stdErr, Char.toString c);
TextIO.output (TextIO.stdErr, "' at ");
TextIO.output (TextIO.stdErr, Int.toString line_no);
TextIO.output (TextIO.stdErr, ":");
TextIO.output (TextIO.stdErr, Int.toString column_no);
TextIO.output (TextIO.stdErr, "\n");
OS.Process.exit OS.Process.failure);
 
(*------------------------------------------------------------------*)
(* For the Mlton compiler, include the following. For Poly/ML, comment
it out. *)
main ();
 
(*------------------------------------------------------------------*)
(* Instructions for GNU Emacs. *)
 
(* local variables: *)
(* mode: sml *)
(* sml-indent-level: 2 *)
(* sml-indent-args: 2 *)
(* end: *)
(*------------------------------------------------------------------*)</syntaxhighlight>
 
 
{{out}}
For Mlton, compile with
<pre>mlton -output lex lex.sml</pre>
 
For Poly/ML, compile with
<pre>polyc -o lex lex.sml</pre>
 
Mlton is an optimizing whole-program compiler. It might take much longer to compile the source but produce much faster executables.
 
Output for testcase3:
<pre> 5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input</pre>
 
=={{header|Wren}}==
Line 12,172 ⟶ 18,676:
{{libheader|Wren-fmt}}
{{libheader|Wren-ioutil}}
<langsyntaxhighlight ecmascriptlang="wren">import "./dynamic" for Enum, Struct, Tuple
import "./str" for Char
import "./fmt" for Fmt
import "./ioutil" for FileUtil
import "os" for Process
 
Line 12,521 ⟶ 19,025:
lineCount = lines.count
initLex.call()
process.call()</langsyntaxhighlight>
 
{{out}}
Line 12,563 ⟶ 19,067:
 
=={{header|Zig}}==
<langsyntaxhighlight lang="zig">
const std = @import("std");
 
Line 12,972 ⟶ 19,476:
return result.items;
}
</syntaxhighlight>
</lang>
9,476

edits