Compiler/lexical analyzer: Difference between revisions

← Older edit

Compiler/lexical analyzer (view source)

Revision as of 15:54, 20 November 2023

199,690 bytes added , 6 months ago

m

→‎{{header|Wren}}: Minor tidy

PureFox

9,476

edits

Revision as of 21:44, 3 April 2022 (view source) Chemoelectric (talk \| contribs) m (→‎{{header\|Icon}}) ← Older edit		Latest revision as of 15:54, 20 November 2023 (view source) PureFox (talk \| contribs) m (→‎{{header\|Wren}}: Minor tidy)
(38 intermediate revisions by 5 users not shown)
Line 158: For example, the following two program fragments are equivalent, and should produce the same token stream except for the line and column positions: * <~~lang~~syntaxhighlight lang="c">if ( p /* meaning n is prime / ) { print ( n , " " ) ; count = count + 1 ; / number of primes found so far / }</~~lang~~syntaxhighlight> <~~lang~~syntaxhighlight lang="c">if(p){print(n," ");count=count+1;}</~~lang~~syntaxhighlight> ;Complete list of token names Line 237: \| style="vertical-align:top" \| Test Case 1: <~~lang~~syntaxhighlight lang="c">/* Hello world / print("Hello, World!\n");</~~lang~~syntaxhighlight> \| style="vertical-align:top" \| Line 255: \| style="vertical-align:top" \| Test Case 2: <~~lang~~syntaxhighlight lang="c">/ Show Ident and Integers / phoenix_number = 142857; print(phoenix_number, "\n");</~~lang~~syntaxhighlight> \| style="vertical-align:top" \| Line 280: \| style="vertical-align:top" \| Test Case 3: <~~lang~~syntaxhighlight lang="c">/ All lexical tokens - not syntactically correct, but that will have to wait until syntax analysis Line 301: /* character literal / '\n' / character literal / '\\' / character literal / ' '</~~lang~~syntaxhighlight> \| style="vertical-align:top" \| Line 344: \| style="vertical-align:top" \| Test Case 4: <~~lang~~syntaxhighlight lang="c">/** test printing, embedded \n and comments with lots of '' */ print(42); print("\nHello World\nGood Bye\nok\n"); print("Print a slash n - \\n.\n");</~~lang~~syntaxhighlight> \| style="vertical-align:top" \| Line 388: =={{header\|Ada}}== <~~lang~~syntaxhighlight lang="ada">with Ada.Text_IO, Ada.Streams.Stream_IO, Ada.Strings.Unbounded, Ada.Command_Line, Ada.Exceptions; use Ada.Strings, Ada.Strings.Unbounded, Ada.Streams, Ada.Exceptions; Line 648: when error : others => IO.Put_Line("Error: " & Exception_Message(error)); end Main; </syntaxhighlight> ~~</lang>~~ {{out}} Test case 3: <pre> Line 686: 23 1 END_OF_INPUT </pre> =={{header\|ALGOL 68}}== This is a simple ''token in, line out'' program. It doesn't keep an internal representation of tokens or anything like that, since that's not needed at all. As an addition, it emits a diagnostic if integer literals are too big. <syntaxhighlight lang="algol68">BEGIN # implement C-like getchar, where EOF and EOLn are "characters" (-1 and 10 resp.). # INT eof = -1, eoln = 10; BOOL eof flag := FALSE; STRING buf := ""; INT col := 1; INT line := 0; on logical file end (stand in, (REF FILE f)BOOL: eof flag := TRUE); PROC getchar = INT: IF eof flag THEN eof ELIF col = UPB buf THEN col +:= 1; eoln ELIF col > UPB buf THEN IF line > 0 THEN read(newline) FI; line +:= 1; read(buf); IF eof flag THEN col := 1; eof ELSE col := 0; getchar FI ELSE col +:= 1; ABS buf[col] FI; PROC nextchar = INT: IF eof flag THEN eof ELIF col >= UPB buf THEN eoln ELSE ABS buf[col+1] FI; PROC is blank = (INT ch) BOOL: ch = 0 OR ch = 9 OR ch = 10 OR ch = 13 OR ch = ABS " "; PROC is digit = (INT ch) BOOL: ch >= ABS "0" AND ch <= ABS "9"; PROC is ident start = (INT ch) BOOL: ch >= ABS "A" AND ch <= ABS "Z" OR ch >= ABS "a" AND ch <= ABS "z" OR ch = ABS "_"; PROC is ident = (INT ch) BOOL: is ident start(ch) OR is digit(ch); PROC ident or keyword = (INT start char) VOID: BEGIN STRING w := REPR start char; INT start col = col; WHILE is ident (next char) DO w +:= REPR getchar OD; IF w = "if" THEN output2("Keyword_if", start col) ELIF w = "else" THEN output2("Keyword_else", start col) ELIF w = "while" THEN output2("Keyword_while", start col) ELIF w = "print" THEN output2("Keyword_print", start col) ELIF w = "putc" THEN output2("Keyword_putc", start col) ELSE output2("Identifier " + w, start col) FI END; PROC char = VOID: BEGIN INT start col = col; INT ch := getchar; IF ch = ABS "'" THEN error("Empty character constant") ELIF ch = ABS "\" THEN ch := getchar; IF ch = ABS "n" THEN ch := 10 ELIF ch = ABS "\" THEN SKIP ELSE error("Unknown escape sequence. \" + REPR ch) FI FI; IF nextchar /= ABS "'" THEN error("Multi-character constant.") FI; getchar; output2("Integer " + whole(ch, 0), start col) END; PROC string = VOID: BEGIN INT start col = col; STRING s := """"; WHILE INT ch := getchar; ch /= ABS """" DO IF ch = eoln THEN error("End-of-line while scanning string literal. Closing string character not found before end-of-line.") ELIF ch = eof THEN error("End-of-file while scanning string literal. Closing string character not found.") ELIF ch = ABS "\" THEN s +:= REPR ch; ch := getchar; IF ch /= ABS "\" AND ch /= ABS "n" THEN error("Unknown escape sequence. \" + REPR ch) FI; s +:= REPR ch ELSE s +:= REPR ch FI OD; output2("String " + s + """", start col) END; PROC comment = VOID: BEGIN WHILE INT ch := getchar; NOT (ch = ABS "" AND nextchar = ABS "/") DO IF ch = eof THEN error("End-of-file in comment. Closing comment characters not found.") FI OD; getchar END; PROC number = (INT first digit) VOID: BEGIN INT start col = col; INT n := first digit - ABS "0"; WHILE is digit (nextchar) DO INT u := getchar - ABS "0"; IF LENG n * 10 + LENG u > max int THEN error("Integer too big") FI; n := n * 10 + u OD; IF is ident start (nextchar) THEN error("Invalid number. Starts like a number, but ends in non-numeric characters.") FI; output2("Integer " + whole(n, 0), start col) END; PROC output = (STRING s) VOID: output2(s, col); PROC output2 = (STRING s, INT col) VOID: print((whole(line,-8), whole(col,-8), " ", s, newline)); PROC if follows = (CHAR second, STRING longer, shorter) VOID: IF nextchar = ABS second THEN output(longer); getchar ELSE output(shorter) FI; PROC error = (STRING s)VOID: (put(stand error, ("At ", whole(line,0), ":", whole(col,0), " ", s, new line)); stop); PROC unrecognized = (INT char) VOID: error("Unrecognized character " + REPR char); PROC double char = (INT first, STRING op) VOID: IF nextchar /= first THEN unrecognized(first) ELSE output2(op, col-1); getchar FI; WHILE INT ch := getchar; ch /= eof DO IF is blank(ch) THEN SKIP ELIF ch = ABS "(" THEN output("LeftParen") ELIF ch = ABS ")" THEN output("RightParen") ELIF ch = ABS "{" THEN output("LeftBrace") ELIF ch = ABS "}" THEN output("RightBrace") ELIF ch = ABS ";" THEN output("Semicolon") ELIF ch = ABS "," THEN output("Comma") ELIF ch = ABS "" THEN output("Op_multiply") ELIF ch = ABS "/" THEN IF next char = ABS "" THEN comment ELSE output("Op_divide") FI ELIF ch = ABS "%" THEN output("Op_mod") ELIF ch = ABS "+" THEN output("Op_add") ELIF ch = ABS "-" THEN output("Op_subtract") ELIF ch = ABS "<" THEN if follows("=", "Op_lessequal", "Op_less") ELIF ch = ABS ">" THEN if follows("=", "Op_greaterequal", "Op_greater") ELIF ch = ABS "=" THEN if follows("=", "Op_equal", "Op_assign") ELIF ch = ABS "!" THEN if follows("=", "Op_notequal", "Op_not") ELIF ch = ABS "&" THEN double char(ch, "Op_and") ELIF ch = ABS "\|" THEN double char(ch, "Op_or") ELIF is ident start (ch) THEN ident or keyword (ch) ELIF ch = ABS """" THEN string ELIF ch = ABS "'" THEN char ELIF is digit(ch) THEN number(ch) ELSE unrecognized(ch) FI OD; output("End_Of_Input") END</syntaxhighlight> =={{header\|ALGOL W}}== <~~lang~~syntaxhighlight lang="algolw">begin %lexical analyser % % Algol W strings are limited to 256 characters in length so we limit source lines % Line 981 ⟶ 1,124: while nextToken not = tEnd_of_input do writeToken; writeToken end.</~~lang~~syntaxhighlight> {{out}} Test case 3: <pre> Line 1,024 ⟶ 1,167: One interesting feature of this implementation is my liberal use of a pushback buffer for input characters. This kept the code modular and easier to write. (One point of note: the C "EOF" pseudo-character is detected in the following code by looking for a negative number. That EOF has to be negative and the other characters non-negative is implied by the ISO C standard.) ~~<lang ATS>(******************************************************************)~~ <syntaxhighlight lang="ats">(*****************************************************************) ( Usage: lex [INPUTFILE [OUTPUTFILE]] If INPUTFILE or OUTPUTFILE is "-" or missing, then standard input Line 1,539 ⟶ 1,684: val cn = ch.column_no in ~~case+~~if ~~int2char0 (~~ch.ichar) of< 0 then \| ~~',' =>~~ ((~~TOKEN_COMMA~~TOKEN_END_OF_INPUT, ",", ln, cn), inp) else ~~\| ';' => ((TOKEN_SEMICOLON, ";", ln, cn), inp)~~ \| ~~'\('~~ =>case+ ~~((TOKEN_LEFTPAREN,~~int2char0 "(~~", ln, cn~~ch.ichar), ~~inp)~~of \| '),' => ((~~TOKEN_RIGHTPAREN~~TOKEN_COMMA, "),", ln, cn), inp) \| '\{;' => ((~~TOKEN_LEFTBRACE~~TOKEN_SEMICOLON, "{;", ln, cn), inp) \| '}\(' => ((~~TOKEN_RIGHTBRACE~~TOKEN_LEFTPAREN, "}(", ln, cn), inp) \| ')' => ((~~TOKEN_MULTIPLY~~TOKEN_RIGHTPAREN, ")", ln, cn), inp) \| '/\{' => ((~~TOKEN_DIVIDE~~TOKEN_LEFTBRACE, "/{", ln, cn), inp) \| '%}' => ((~~TOKEN_MOD~~TOKEN_RIGHTBRACE, "%}", ln, cn), inp) \| '+' => ((~~TOKEN_ADD~~TOKEN_MULTIPLY, "+", ln, cn), inp) \| '-/' => ((~~TOKEN_SUBTRACT~~TOKEN_DIVIDE, "-/", ln, cn), inp) \| '<%' => ((TOKEN_MOD, "%", ln, cn), inp) \| '+' => ((TOKEN_ADD, "+", ln, cn), inp) ~~let~~ \| '-' ~~val~~=> (~~ch1~~(TOKEN_SUBTRACT, ~~inp)~~"-", =ln, ~~get_ch~~cn), inp) in\| '<' => let ~~if (ch1.ichar) = char2i '=' then~~ val (~~(TOKEN_LESSEQUAL~~ch1, "<inp) =", ~~ln, cn),~~get_ch inp) ~~else~~in ~~let~~if (ch1.ichar) = char2i '=' then ~~val inp~~((TOKEN_LESSEQUAL, "<=", ~~push_back_ch~~ln, ~~(ch1~~cn), inp) inelse ~~((TOKEN_LESS, "<", ln, cn), inp)~~let ~~end~~ val inp = push_back_ch (ch1, inp) ~~end~~ in ((TOKEN_LESS, "<", ln, cn), inp) ~~\| '>' =>~~ ~~let~~ end ~~val (ch1, inp) = get_ch inp~~end in\| '>' => let ~~if (ch1.ichar) = char2i '=' then~~ val (~~(TOKEN_GREATEREQUAL~~ch1, ">inp) =", ~~ln, cn),~~get_ch inp) ~~else~~in ~~let~~if (ch1.ichar) = char2i '=' then ~~val inp~~((TOKEN_GREATEREQUAL, ">=", ~~push_back_ch~~ln, ~~(ch1~~cn), inp) inelse ~~((TOKEN_GREATER, ">", ln, cn), inp)~~let ~~end~~ val inp = push_back_ch (ch1, inp) ~~end~~ in ((TOKEN_GREATER, ">", ln, cn), inp) ~~\| '=' =>~~ ~~let~~ end ~~val (ch1, inp) = get_ch inp~~end in\| '=' => let ~~if (ch1.ichar) = char2i '=' then~~ val (~~(TOKEN_EQUAL~~ch1, "inp) =~~=",~~ ~~ln, cn),~~get_ch inp) ~~else~~in ~~let~~if (ch1.ichar) = char2i '=' then ~~val inp~~((TOKEN_EQUAL, "==", ~~push_back_ch~~ln, ~~(ch1~~cn), inp) inelse ~~((TOKEN_ASSIGN, "=", ln, cn), inp)~~let ~~end~~ val inp = push_back_ch (ch1, inp) ~~end~~ in ((TOKEN_ASSIGN, "=", ln, cn), inp) ~~\| '!' =>~~ ~~let~~ end ~~val (ch1, inp) = get_ch inp~~end in\| '!' => let ~~if (ch1.ichar) = char2i '=' then~~ val (~~(TOKEN_NOTEQUAL~~ch1, "!inp) =", ~~ln, cn),~~get_ch inp) ~~else~~in ~~let~~if (ch1.ichar) = char2i '=' then ~~val inp~~((TOKEN_NOTEQUAL, "!=", ~~push_back_ch~~ln, ~~(ch1~~cn), inp) inelse ~~((TOKEN_NOT, "!", ln, cn), inp)~~let ~~end~~ val inp = push_back_ch (ch1, inp) ~~end~~ in ((TOKEN_NOT, "!", ln, cn), inp) ~~\| '&' =>~~ ~~let~~ end ~~val (ch1, inp) = get_ch inp~~end in\| '&' => let ~~if (ch1.ichar) = char2i '&' then~~ val (~~(TOKEN_AND~~ch1, ~~"&&",~~inp) ~~ln,~~= ~~cn),~~get_ch inp) ~~else~~in ~~$raise unexpected_character~~if (chch1.~~line_no,~~ichar) ~~ch.column_no,~~= char2i '&' then ((TOKEN_AND, "&&", ln, cn), ~~ch.ichar~~inp) ~~end~~ else $raise unexpected_character (ch.line_no, ch.column_no, ~~\| '\|' =>~~ ch.ichar) ~~let~~ ~~val (ch1, inp) = get_ch inp~~end in\| '\|' => let ~~if (ch1.ichar) = char2i '\|' then~~ val (~~(TOKEN_OR~~ch1, ~~"\|\|",~~inp) ~~ln,~~= ~~cn),~~get_ch inp) ~~else~~in ~~$raise unexpected_character~~if (chch1.~~line_no,~~ichar) ~~ch.column_no,~~= char2i '\|' then ((TOKEN_OR, "\|\|", ln, cn), ~~ch.ichar~~inp) ~~end~~ else $raise unexpected_character (ch.line_no, ch.column_no, ~~\| '"' =>~~ ch.ichar) ~~let~~ end ~~val inp = push_back_ch (ch, inp)~~ in\| '"' => ~~scan_string_literal inp~~let val inp = push_back_ch (ch, inp) ~~end~~ \| ~~'\''~~ => in scan_string_literal inp ~~let~~ end ~~val inp = push_back_ch (ch, inp)~~ in\| '\'' => ~~scan_character_literal inp~~let val inp = push_back_ch (ch, inp) ~~end~~ \| _ ~~when~~ ~~isdigit~~ ~~(ch.ichar) =>~~in scan_character_literal inp ~~let~~ end ~~val inp = push_back_ch (ch, inp)~~ \| _ when isdigit (ch.ichar) => in let ~~scan_integer_literal (inp, lookups)~~ val inp = push_back_ch (ch, inp) ~~end~~ in ~~\| _ when is_ident_start (ch.ichar) =>~~ scan_integer_literal (inp, lookups) ~~let~~ end ~~val inp = push_back_ch (ch, inp)~~ \| _ when is_ident_start (ch.ichar) => in let ~~scan_identifier_or_reserved_word (inp, lookups)~~ val inp = push_back_ch (ch, inp) ~~end~~ in ~~\| _ => $raise unexpected_character (ch.line_no, ch.column_no,~~ scan_identifier_or_reserved_word (inp, lookups) ~~ch.ichar)~~ end \| _ => $raise unexpected_character (ch.line_no, ch.column_no, ch.ichar) end Line 1,702 ⟶ 1,850: lookups : !lookups_vt) : void = let val (toktup, inp) = ~~skip_spaces_and_comments~~get_next_token (inp, lookups) ~~val (ch, inp) = get_ch inp~~ ~~val ln = ch.line_no~~ ~~val cn = ch.column_no~~ in ifprint_token (~~ch.ichar)~~outf, <toktup, ~~0 then~~lookups); if toktup.0 ~~print_token~~<> ~~(outf, (~~TOKEN_END_OF_INPUT~~, "", ln,~~ ~~cn),~~then loop (inp, lookups) ~~else~~ ~~let~~ ~~val inp = push_back_ch (ch, inp)~~ ~~val (toktup, inp) = get_next_token (inp, lookups)~~ in ~~print_token (outf, toktup, lookups);~~ ~~loop (inp, lookups)~~ ~~end~~ end in Line 1,904 ⟶ 2,041: end (*******************************************************************)</~~lang~~syntaxhighlight> {{out}} Line 1,945 ⟶ 2,082: =={{header\|AWK}}== Tested with gawk 4.1.1 and mawk 1.3.4. <syntaxhighlight lang="awk"> ~~<lang AWK>~~ BEGIN { all_syms["tk_EOI" ] = "End_of_input" Line 2,151 ⟶ 2,288: } } </syntaxhighlight> ~~</lang>~~ {{out\|case=count}} <b> Line 2,188 ⟶ 2,325: =={{header\|C}}== Tested with gcc 4.81 and later, compiles warning free with -Wpedantic -pedantic -Wall -Wextra <~~lang~~syntaxhighlight Clang="c">#include <stdlib.h> #include <stdio.h> #include <stdarg.h> Line 2,420 ⟶ 2,557: run(); return 0; }</~~lang~~syntaxhighlight> {{out\|case=test case 3}} Line 2,464 ⟶ 2,601: =={{header\|C sharp\|C#}}== Requires C#6.0 because of the use of null coalescing operators. <~~lang~~syntaxhighlight lang="csharp"> using System; using System.IO; Line 2,814 ⟶ 2,951: } } </syntaxhighlight> ~~</lang>~~ {{out\|case=test case 3}} Line 2,858 ⟶ 2,995: =={{header\|C++}}== Tested with GCC 9.3.0 (g++ -std=c++17) <~~lang~~syntaxhighlight lang="cpp">#include <charconv> // std::from_chars #include <fstream> // file_to_string, string_to_file #include <functional> // std::invoke Line 3,243 ⟶ 3,380: }); } </syntaxhighlight> ~~</lang>~~ {{out\|case=test case 3}} Line 3,288 ⟶ 3,425: Using GnuCOBOL 2. By Steve Williams (with one change to get around a Rosetta Code code highlighter problem). <~~lang~~syntaxhighlight lang="cobol"> >>SOURCE FORMAT IS FREE > this code is dedicated to the public domain > (GnuCOBOL) 2.3-dev.0 Line 3,694 ⟶ 3,831: end-if . end program lexer.</~~lang~~syntaxhighlight> {{out\|case=test case 3}} Line 3,736 ⟶ 3,873: Lisp has a built-in reader and you can customize the reader by modifying its readtable. I'm also using the Gray stream, which is an almost standard feature of Common Lisp, for counting lines and columns. <~~lang~~syntaxhighlight lang="lisp">(defpackage #:lexical-analyzer (:use #:cl #:sb-gray) (:export #:main)) Line 3,949 ⟶ 4,086: (defun main () (lex standard-input))</~~lang~~syntaxhighlight> {{out\|case=test case 3}} <pre> 5 16 KEYWORD-PRINT Line 3,985 ⟶ 4,122: 22 26 INTEGER 32 23 1 END-OF-INPUT </pre> =={{header\|Elixir}}== {{works with\|Elixir\|1.13.3}} {{trans\|ATS}} <syntaxhighlight lang="elixir">#!/bin/env elixir # -- elixir -- defmodule Lex do def main args do {inpf_name, outf_name, exit_status} = case args do [] -> {"-", "-", 0} [name] -> {name, "-", 0} [name1, name2] -> {name1, name2, 0} [name1, name2 \| _] -> {name1, name2, usage_error()} end {inpf, outf, exit_status} = case {inpf_name, outf_name, exit_status} do {"-", "-", 0} -> {:stdio, :stdio, 0} {name1, "-", 0} -> {inpf, exit_status} = open_file(name1, [:read]) {inpf, :stdio, exit_status} {"-", name2, 0} -> {outf, exit_status} = open_file(name2, [:write]) {:stdio, outf, exit_status} {name1, name2, 0} -> {inpf, exit_status} = open_file(name1, [:read]) if exit_status != 0 do {inpf, name2, exit_status} else {outf, exit_status} = open_file(name2, [:write]) {inpf, outf, exit_status} end _ -> {inpf_name, outf_name, exit_status} end exit_status = case exit_status do 0 -> main_program inpf, outf _ -> exit_status end # Choose one. System.halt exit_status # Fast exit. #System.stop exit_status # Laborious cleanup. end def main_program inpf, outf do inp = make_inp inpf scan_text outf, inp exit_status = 0 exit_status end def open_file name, rw do case File.open name, rw do {:ok, f} -> {f, 0} _ -> IO.write :stderr, "Cannot open " IO.write :stderr, name case rw do [:read] -> IO.puts " for input" [:write] -> IO.puts " for output" end {name, 1} end end def scan_text outf, inp do {toktup, inp} = get_next_token inp print_token outf, toktup case toktup do {"End_of_input", _, _, _} -> :ok _ -> scan_text outf, inp end end def print_token outf, {tok, arg, line_no, column_no} do IO.write outf, (String.pad_leading "#{line_no}", 5) IO.write outf, " " IO.write outf, (String.pad_leading "#{column_no}", 5) IO.write outf, " " IO.write outf, tok case tok do "Identifier" -> IO.write outf, " " IO.write outf, arg "Integer" -> IO.write outf, " " IO.write outf, arg "String" -> IO.write outf, " " IO.write outf, arg _ -> :ok end IO.puts outf, "" end ###------------------------------------------------------------------- ### ### The token dispatcher. ### def get_next_token inp do inp = skip_spaces_and_comments inp {ch, inp} = get_ch inp {chr, line_no, column_no} = ch ln = line_no cn = column_no case chr do :eof -> {{"End_of_input", "", ln, cn}, inp} "," -> {{"Comma", ",", ln, cn}, inp} ";" -> {{"Semicolon", ";", ln, cn}, inp} "(" -> {{"LeftParen", "(", ln, cn}, inp} ")" -> {{"RightParen", ")", ln, cn}, inp} "{" -> {{"LeftBrace", "{", ln, cn}, inp} "}" -> {{"RightBrace", "}", ln, cn}, inp} "" -> {{"Op_multiply", "", ln, cn}, inp} "/" -> {{"Op_divide", "/", ln, cn}, inp} "%" -> {{"Op_mod", "%", ln, cn}, inp} "+" -> {{"Op_add", "+", ln, cn}, inp} "-" -> {{"Op_subtract", "-", ln, cn}, inp} "<" -> {ch1, inp} = get_ch inp {chr1, _, _} = ch1 case chr1 do "=" -> {{"Op_lessequal", "<=", ln, cn}, inp} _ -> {{"Op_less", "<", ln, cn}, (push_back ch1, inp)} end ">" -> {ch1, inp} = get_ch inp {chr1, _, _} = ch1 case chr1 do "=" -> {{"Op_greaterequal", ">=", ln, cn}, inp} _ -> {{"Op_greater", ">", ln, cn}, (push_back ch1, inp)} end "=" -> {ch1, inp} = get_ch inp {chr1, _, _} = ch1 case chr1 do "=" -> {{"Op_equal", "==", ln, cn}, inp} _ -> {{"Op_assign", "=", ln, cn}, (push_back ch1, inp)} end "!" -> {ch1, inp} = get_ch inp {chr1, _, _} = ch1 case chr1 do "=" -> {{"Op_notequal", "!=", ln, cn}, inp} _ -> {{"Op_not", "!", ln, cn}, (push_back ch1, inp)} end "&" -> {ch1, inp} = get_ch inp {chr1, _, _} = ch1 case chr1 do "&" -> {{"Op_and", "&&", ln, cn}, inp} _ -> unexpected_character ln, cn, chr end "\|" -> {ch1, inp} = get_ch inp {chr1, _, _} = ch1 case chr1 do "\|" -> {{"Op_or", "\|\|", ln, cn}, inp} _ -> unexpected_character ln, cn, chr end "\"" -> inp = push_back ch, inp scan_string_literal inp "'" -> inp = push_back ch, inp scan_character_literal inp _ -> cond do String.match? chr, ~r/^[[:digit:]]$/u -> inp = push_back ch, inp scan_integer_literal inp String.match? chr, ~r/^[[:alpha:]_]$/u -> inp = push_back ch, inp scan_identifier_or_reserved_word inp true -> unexpected_character ln, cn, chr end end end ###------------------------------------------------------------------- ### ### Skipping past spaces and / ... / comments. ### ### Comments are treated exactly like a bit of whitespace. They never ### make it to the dispatcher. ### def skip_spaces_and_comments inp do {ch, inp} = get_ch inp {chr, line_no, column_no} = ch cond do chr == :eof -> push_back ch, inp String.match? chr, ~r/^[[:space:]]$/u -> skip_spaces_and_comments inp chr == "/" -> {ch1, inp} = get_ch inp case ch1 do {"", _, _} -> inp = scan_comment inp, line_no, column_no skip_spaces_and_comments inp _ -> push_back ch, (push_back ch1, inp) end true -> push_back ch, inp end end def scan_comment inp, line_no, column_no do {ch, inp} = get_ch inp case ch do {:eof, _, _} -> unterminated_comment line_no, column_no {"", _, _} -> {ch1, inp} = get_ch inp case ch1 do {:eof, _, _} -> unterminated_comment line_no, column_no {"/", _, _} -> inp _ -> scan_comment inp, line_no, column_no end _ -> scan_comment inp, line_no, column_no end end ###------------------------------------------------------------------- ### ### Scanning of integer literals, identifiers, and reserved words. ### ### These three types of token are very similar to each other. ### def scan_integer_literal inp do # Scan an entire word, not just digits. This way we detect # erroneous text such as "23skidoo". {line_no, column_no, inp} = get_position inp {word, inp} = scan_word inp if String.match? word, (~r/^[[:digit:]]+$/u) do {{"Integer", word, line_no, column_no}, inp} else invalid_integer_literal line_no, column_no, word end end def scan_identifier_or_reserved_word inp do # It is assumed that the first character is of the correct type, # thanks to the dispatcher. {line_no, column_no, inp} = get_position inp {word, inp} = scan_word inp tok = case word do "if" -> "Keyword_if" "else" -> "Keyword_else" "while" -> "Keyword_while" "print" -> "Keyword_print" "putc" -> "Keyword_putc" _ -> "Identifier" end {{tok, word, line_no, column_no}, inp} end def scan_word inp, word\\"" do {ch, inp} = get_ch inp {chr, _, _} = ch if String.match? chr, (~r/^[[:alnum:]_]$/u) do scan_word inp, (word <> chr) else {word, (push_back ch, inp)} end end def get_position inp do {ch, inp} = get_ch inp {_, line_no, column_no} = ch inp = push_back ch, inp {line_no, column_no, inp} end ###------------------------------------------------------------------- ### ### Scanning of string literals. ### ### It is assumed that the first character is the opening quote, and ### that the closing quote is the same character. ### def scan_string_literal inp do {ch, inp} = get_ch inp {quote_mark, line_no, column_no} = ch {contents, inp} = scan_str_lit inp, ch {{"String", quote_mark <> contents <> quote_mark, line_no, column_no}, inp} end def scan_str_lit inp, ch, contents\\"" do {quote_mark, line_no, column_no} = ch {ch1, inp} = get_ch inp {chr1, line_no1, column_no1} = ch1 if chr1 == quote_mark do {contents, inp} else case chr1 do :eof -> eoi_in_string_literal line_no, column_no "\n" -> eoln_in_string_literal line_no, column_no "\\" -> {ch2, inp} = get_ch inp {chr2, _, _} = ch2 case chr2 do "n" -> scan_str_lit inp, ch, (contents <> "\\n") "\\" -> scan_str_lit inp, ch, (contents <> "\\\\") _ -> unsupported_escape line_no1, column_no1, chr2 end _ -> scan_str_lit inp, ch, (contents <> chr1) end end end ###------------------------------------------------------------------- ### ### Scanning of character literals. ### ### It is assumed that the first character is the opening quote, and ### that the closing quote is the same character. ### ### The tedious part of scanning a character literal is distinguishing ### between the kinds of lexical error. (One might wish to modify the ### code to detect, as a distinct kind of error, end of line within a ### character literal.) ### def scan_character_literal inp do {ch, inp} = get_ch inp {_, line_no, column_no} = ch {ch1, inp} = get_ch inp {chr1, line_no1, column_no1} = ch1 {intval, inp} = case chr1 do :eof -> unterminated_character_literal line_no, column_no "\\" -> {ch2, inp} = get_ch inp {chr2, _, _} = ch2 case chr2 do :eof -> unterminated_character_literal line_no, column_no "n" -> {(:binary.first "\n"), inp} "\\" -> {(:binary.first "\\"), inp} _ -> unsupported_escape line_no1, column_no1, chr2 end _ -> {(:binary.first chr1), inp} end inp = check_character_literal_end inp, ch {{"Integer", "#{intval}", line_no, column_no}, inp} end def check_character_literal_end inp, ch do {chr, _, _} = ch {{chr1, _, _}, inp} = get_ch inp if chr1 == chr do inp else # Lexical error. find_char_lit_end inp, ch end end def find_char_lit_end inp, ch do {chr, line_no, column_no} = ch {{chr1, _, _}, inp} = get_ch inp if chr1 == chr do multicharacter_literal line_no, column_no else case chr1 do :eof -> unterminated_character_literal line_no, column_no _ -> find_char_lit_end inp, ch end end end ###------------------------------------------------------------------- ### ### Character-at-a-time input, with unrestricted pushback, and with ### line and column numbering. ### def make_inp inpf do {inpf, [], 1, 1} end def get_ch {inpf, pushback, line_no, column_no} do case pushback do [head \| tail] -> {head, {inpf, tail, line_no, column_no}} [] -> case IO.read(inpf, 1) do :eof -> {{:eof, line_no, column_no}, {inpf, pushback, line_no, column_no}} {:error, _} -> {{:eof, line_no, column_no}, {inpf, pushback, line_no, column_no}} chr -> case chr do "\n" -> {{chr, line_no, column_no}, {inpf, pushback, line_no + 1, 1}} _ -> {{chr, line_no, column_no}, {inpf, pushback, line_no, column_no + 1}} end end end end def push_back ch, {inpf, pushback, line_no, column_no} do {inpf, [ch \| pushback], line_no, column_no} end ###------------------------------------------------------------------- ### ### Lexical and usage errors. ### def unterminated_comment line_no, column_no do raise "#{scriptname()}: unterminated comment at #{line_no}:#{column_no}" end def invalid_integer_literal line_no, column_no, word do raise "#{scriptname()}: invalid integer literal #{word} at #{line_no}:#{column_no}" end def unsupported_escape line_no, column_no, chr do raise "#{scriptname()}: unsupported escape \\#{chr} at #{line_no}:#{column_no}" end def eoi_in_string_literal line_no, column_no do raise "#{scriptname()}: end of input in string literal starting at #{line_no}:#{column_no}" end def eoln_in_string_literal line_no, column_no do raise "#{scriptname()}: end of line in string literal starting at #{line_no}:#{column_no}" end def multicharacter_literal line_no, column_no do raise "#{scriptname()}: unsupported multicharacter literal at #{line_no}:#{column_no}" end def unterminated_character_literal line_no, column_no do raise "#{scriptname()}: unterminated character literal starting at #{line_no}:#{column_no}" end def unexpected_character line_no, column_no, chr do raise "#{scriptname()}: unexpected character '#{chr}' at #{line_no}:#{column_no}" end def usage_error() do IO.puts "Usage: #{scriptname()} [INPUTFILE [OUTPUTFILE]]" IO.puts "If either of INPUTFILE or OUTPUTFILE is not present or is \"-\"," IO.puts "standard input or standard output is used, respectively." exit_status = 2 exit_status end def scriptname() do Path.basename(__ENV__.file) end #--------------------------------------------------------------------- end ## module Lex Lex.main(System.argv)</syntaxhighlight> {{out}} <pre>$ ./lex testcase3.t 5 16 Keyword_print 5 40 Op_subtract 6 16 Keyword_putc 6 40 Op_less 7 16 Keyword_if 7 40 Op_greater 8 16 Keyword_else 8 40 Op_lessequal 9 16 Keyword_while 9 40 Op_greaterequal 10 16 LeftBrace 10 40 Op_equal 11 16 RightBrace 11 40 Op_notequal 12 16 LeftParen 12 40 Op_and 13 16 RightParen 13 40 Op_or 14 16 Op_subtract 14 40 Semicolon 15 16 Op_not 15 40 Comma 16 16 Op_multiply 16 40 Op_assign 17 16 Op_divide 17 40 Integer 42 18 16 Op_mod 18 40 String "String literal" 19 16 Op_add 19 40 Identifier variable_name 20 26 Integer 10 21 26 Integer 92 22 26 Integer 32 23 1 End_of_input</pre> =={{header\|Emacs Lisp}}== {{works with\|Emacs\|GNU 27.2}} {{trans\|ATS}} <syntaxhighlight lang="lisp">#!/usr/bin/emacs --script ;; ;; The Rosetta Code lexical analyzer in GNU Emacs Lisp. ;; ;; Migrated from the ATS. However, Emacs Lisp is not friendly to the ;; functional style of the ATS implementation; therefore the ;; differences are vast. ;; ;; (A Scheme migration could easily, on the other hand, have been ;; almost exact. It is interesting to contrast Lisp dialects and see ;; how huge the differences are.) ;; ;; The script currently takes input only from standard input and ;; writes the token stream only to standard output. ;; (require 'cl-lib) ;;; The type of a character, consisting of its code point and where it ;;; occurred in the text. (cl-defstruct (ch_t (:constructor make-ch (ichar line-no column-no))) ichar line-no column-no) (defun ch-ichar (ch) (ch_t-ichar ch)) (defun ch-line-no (ch) (ch_t-line-no ch)) (defun ch-column-no (ch) (ch_t-column-no ch)) ;;; The type of an "inputter", consisting of an open file for the ;;; text, a pushback buffer (which is an indefinitely deep stack of ;;; ch_t), an input buffer for the current line, and a position in the ;;; text. (cl-defstruct inp_t file pushback line line-no column-no) (defun make-inp (file) "Initialize a new inp_t." (make-inp_t :file file :pushback '() :line "" :line-no 0 :column-no 0)) (defvar inp (make-inp t) "A global inp_t.") (defun get-ch () "Get a ch_t, either from the pushback buffer or from the input." (pcase (inp_t-pushback inp) (`(,ch . ,tail) ;; Emacs Lisp has only single value return, so the results come ;; back as a list rather than multiple values. (setq inp (make-inp_t :file (inp_t-file inp) :pushback tail :line (inp_t-line inp) :line-no (inp_t-line-no inp) :column-no (inp_t-column-no inp))) ch) ('() (let ((line (inp_t-line inp)) (line-no (inp_t-line-no inp)) (column-no (inp_t-column-no inp))) (when (string= line "") ;; Refill the buffer. (let ((text (condition-case nil (read-string "") nil (error 'eoi)))) (if (eq text 'eoi) (setq line 'eoi) (setq line (format "%s%c" text ?\n))) (setq line-no (1+ line-no)) (setq column-no 1))) (if (eq line 'eoi) (progn (setq inp (make-inp_t :file (inp_t-file inp) :pushback (inp_t-pushback inp) :line line :line-no line-no :column-no column-no)) (make-ch 'eoi line-no column-no)) (let ((c (elt line 0)) (line (substring line 1))) (setq inp (make-inp_t :file (inp_t-file inp) :pushback (inp_t-pushback inp) :line line :line-no line-no :column-no (1+ column-no))) (make-ch c line-no column-no))))))) (defun get-new-line (file) ;; Currently "file" is ignored and the input must be from stdin. (read-from-minibuffer "" :default 'eoi)) (defun push-back (ch) "Push back a ch_t." (setq inp (make-inp_t :file (inp_t-file inp) :pushback (cons ch (inp_t-pushback inp)) :line (inp_t-line inp) :line-no (inp_t-line-no inp) :column-no (inp_t-column-no inp)))) (defun get-position () "Return the line-no and column-no of the next ch_t to be returned by get-ch, assuming there are no more pushbacks beforehand." (let ((ch (get-ch)) (line-no (ch-line-no ch)) (column-no (ch-column-no ch))) (push-back ch) (list line-no column-no))) (defun scan-text (outf) "The main loop." (cl-loop for toktup = (get-next-token) do (print-token outf toktup) until (string= (elt toktup 0) "End_of_input"))) (defun print-token (outf toktup) "Print a token, along with its position and possibly an argument." ;; Currently outf is ignored, and the output goes to stdout. (pcase toktup (`(,tok ,arg ,line-no ,column-no) (princ (format "%5d %5d %s" line-no column-no tok)) (pcase tok ("Identifier" (princ (format " %s\n" arg))) ("Integer" (princ (format " %s\n" arg))) ("String" (princ (format " %s\n" arg))) (_ (princ "\n")))))) (defun get-next-token () "The token dispatcher. Returns the next token, as a list along with its argument and text position." (skip-spaces-and-comments) (let* ((ch (get-ch)) (ln (ch-line-no ch)) (cn (ch-column-no ch))) (pcase (ch-ichar ch) ('eoi (list "End_of_input" "" ln cn)) (?, (list "Comma" "," ln cn)) (?\N{SEMICOLON} (list "Semicolon" ";" ln cn)) (?\N{LEFT PARENTHESIS} (list "LeftParen" "(" ln cn)) (?\N{RIGHT PARENTHESIS} (list "RightParen" ")" ln cn)) (?{ (list "LeftBrace" "{" ln cn)) (?} (list "RightBrace" "}" ln cn)) (?* (list "Op_multiply" "" ln cn)) (?/ (list "Op_divide" "/" ln cn)) (?% (list "Op_mod" "%" ln cn)) (?+ (list "Op_add" "+" ln cn)) (?- (list "Op_subtract" "-" ln cn)) (?< (let ((ch1 (get-ch))) (pcase (ch-ichar ch1) (?= (list "Op_lessequal" "<=" ln cn)) (_ (push-back ch1) (list "Op_less" "<" ln cn))))) (?> (let ((ch1 (get-ch))) (pcase (ch-ichar ch1) (?= (list "Op_greaterequal" ">=" ln cn)) (_ (push-back ch1) (list "Op_greater" ">" ln cn))))) (?= (let ((ch1 (get-ch))) (pcase (ch-ichar ch1) (?= (list "Op_equal" "==" ln cn)) (_ (push-back ch1) (list "Op_assign" "=" ln cn))))) (?! (let ((ch1 (get-ch))) (pcase (ch-ichar ch1) (?= (list "Op_notequal" "!=" ln cn)) (_ (push-back ch1) (list "Op_not" "!" ln cn))))) (?& (let ((ch1 (get-ch))) (pcase (ch-ichar ch1) (?& (list "Op_and" "&&" ln cn)) (_ (unexpected-character ln cn (get-ichar ch)))))) (?\| (let ((ch1 (get-ch))) (pcase (ch-ichar ch1) (?\| (list "Op_or" "\|\|" ln cn)) (_ (unexpected-character ln cn (get-ichar ch)))))) (?\N{QUOTATION MARK} (push-back ch) (scan-string-literal)) (?\N{APOSTROPHE} (push-back ch) (scan-character-literal)) ((pred digitp) (push-back ch) (scan-integer-literal)) ((pred identifier-start-p) (progn (push-back ch) (scan-identifier-or-reserved-word))) (c (unexpected-character ln cn c))))) (defun skip-spaces-and-comments () "Skip spaces and comments. A comment is treated as equivalent to a run of spaces." (cl-loop for ch = (let ((ch1 (get-ch))) (pcase (ch-ichar ch1) (?/ (let ((ch2 (get-ch)) (line-no (ch-line-no ch1)) (column-no (ch-column-no ch1)) (position `(,line-no ,column-no))) (pcase (ch-ichar ch2) (?* (scan-comment position) (get-ch)) (_ (push-back ch2) ch1)))) (_ ch1))) while (spacep (ch-ichar ch)) finally do (push-back ch))) (defun scan-comment (position) (cl-loop for ch = (get-ch) for done = (comment-done-p ch position) until done)) (defun comment-done-p (ch position) (pcase (ch-ichar ch) ('eoi (apply 'unterminated-comment position)) (?* (let ((ch1 (get-ch))) (pcase (ch-ichar ch1) ('eoi (apply 'unterminated-comment position)) (?/ t) (_ nil)))) (_ nil))) (defun scan-integer-literal () "Scan an integer literal, on the assumption that a digit has been seen and pushed back." (let* ((position (get-position)) (lst (scan-word)) (s (list-to-string lst))) (if (all-digits-p lst) `("Integer" ,s . ,position) (apply 'illegal-integer-literal `(,@position , s))))) (defun scan-identifier-or-reserved-word () "Scan an identifier or reserved word, on the assumption that a legal first character (for an identifier) has been seen and pushed back." (let* ((position (get-position)) (lst (scan-word)) (s (list-to-string lst)) (tok (pcase s ("else" "Keyword_else") ("if" "Keyword_if") ("while" "Keyword_while") ("print" "Keyword_print") ("putc" "Keyword_putc") (_ "Identifier")))) `(,tok ,s . ,position))) (defun scan-word () (cl-loop for ch = (get-ch) while (identifier-continuation-p (ch-ichar ch)) collect (ch-ichar ch) finally do (push-back ch))) (defun scan-string-literal () "Scan a string literal, on the assumption that a double quote has been seen and pushed back." (let* ((ch (get-ch)) (_ (cl-assert (= (ch-ichar ch) ?\N{QUOTATION MARK}))) (line-no (ch-line-no ch)) (column-no (ch-column-no ch)) (position `(,line-no ,column-no)) (lst (scan-str-lit position)) (lst `(?\N{QUOTATION MARK} ,@lst ?\N{QUOTATION MARK}))) `("String" ,(list-to-string lst) . ,position))) (defun scan-str-lit (position) (flatten (cl-loop for ch = (get-ch) until (= (ch-ichar ch) ?\N{QUOTATION MARK}) collect (process-str-lit-character (ch-ichar ch) position)))) (defun process-str-lit-character (c position) ;; NOTE: This script might insert a newline before any eoi, so that ;; "end-of-input-in-string-literal" never actually occurs. It is a ;; peculiarity of the script's input mechanism. (pcase c ('eoi (apply 'end-of-input-in-string-literal position)) (?\n (apply 'end-of-line-in-string-literal position)) (?\\ (let ((ch1 (get-ch))) (pcase (ch-ichar ch1) (?n '(?\\ ?n)) (?\\ '(?\\ ?\\)) (c (unsupported-escape (ch-line-no ch1) (ch-column-no ch1) c))))) (c c))) (defun scan-character-literal () "Scan a character literal, on the assumption that an ASCII single quote (that is, a Unicode APOSTROPHE) has been seen and pushed back." (let* ((toktup (scan-character-literal-without-checking-end)) (line-no (elt toktup 2)) (column-no (elt toktup 3)) (position (list line-no column-no))) (check-char-lit-end position) toktup)) (defun check-char-lit-end (position) (let ((ch (get-ch))) (unless (and (integerp (ch-ichar ch)) (= (ch-ichar ch) ?\N{APOSTROPHE})) (push-back ch) (loop-to-char-lit-end position)))) (defun loop-to-char-lit-end (position) (cl-loop for ch = (get-ch) until (or (eq (ch-ichar ch) 'eoi) (= (ch-ichar ch) ?\N{APOSTROPHE})) finally do (if (eq (ch-ichar ch) 'eoi) (apply 'unterminated-character-literal position) (apply 'multicharacter-literal position)))) (defun scan-character-literal-without-checking-end () (let* ((ch (get-ch)) (_ (cl-assert (= (ch-ichar ch) ?\N{APOSTROPHE}))) (line-no (ch-line-no ch)) (column-no (ch-column-no ch)) (position (list line-no column-no)) (ch1 (get-ch))) (pcase (ch-ichar ch1) ('eoi (apply 'unterminated-character-literal position)) (?\\ (let ((ch2 (get-ch))) (pcase (ch-ichar ch2) ('eoi (apply 'unterminated-character-literal position)) (?n `("Integer" ,(format "%d" ?\n) . ,position)) (?\\ `("Integer" ,(format "%d" ?\\) . ,position)) (c (unsupported-escape (ch-line-no ch1) (ch-column-no ch1) c))))) (c `("Integer" ,(format "%d" c) . ,position))))) (defun spacep (c) (and (integerp c) (or (= c ?\N{SPACE}) (and (<= 9 c) (<= c 13))))) (defun digitp (c) (and (integerp c) (<= ?0 c) (<= c ?9))) (defun lowerp (c) ;; Warning: in EBCDIC, this kind of test for "alphabetic" is no ;; good. The letters are not contiguous. (and (integerp c) (<= ?a c) (<= c ?z))) (defun upperp (c) ;; Warning: in EBCDIC, this kind of test for "alphabetic" is no ;; good. The letters are not contiguous. (and (integerp c) (<= ?A c) (<= c ?Z))) (defun alphap (c) (or (lowerp c) (upperp c))) (defun identifier-start-p (c) (and (integerp c) (or (alphap c) (= c ?_)))) (defun identifier-continuation-p (c) (and (integerp c) (or (alphap c) (= c ?_) (digitp c)))) (defun all-digits-p (thing) (cl-loop for c in thing if (not (digitp c)) return nil finally return t)) (defun list-to-string (lst) "Convert a list of characters to a string." (apply 'string lst)) (defun flatten (lst) "Flatten nested lists. (The implementation is recursive and not for very long lists.)" (pcase lst ('() '()) (`(,head . ,tail) (if (listp head) (append (flatten head) (flatten tail)) (cons head (flatten tail)))))) (defun unexpected-character (line-no column-no c) (error (format "unexpected character '%c' at %d:%d" c line-no column-no))) (defun unsupported-escape (line-no column-no c) (error (format "unsupported escape \\%c at %d:%d" c line-no column-no))) (defun illegal-integer-literal (line-no column-no s) (error (format "illegal integer literal \"%s\" at %d:%d" s line-no column-no))) (defun unterminated-character-literal (line-no column-no) (error (format "unterminated character literal starting at %d:%d" line-no column-no))) (defun multicharacter-literal (line-no column-no) (error (format "unsupported multicharacter literal starting at %d:%d" line-no column-no))) (defun end-of-input-in-string-literal (line-no column-no) (error (format "end of input in string literal starting at %d:%d" line-no column-no))) (defun end-of-line-in-string-literal (line-no column-no) (error (format "end of line in string literal starting at %d:%d" line-no column-no))) (defun unterminated-comment (line-no column-no) (error (format "unterminated comment starting at %d:%d" line-no column-no))) (defun main () (setq inp (make-inp t)) (scan-text t)) (main)</syntaxhighlight> {{out}} <pre>$ ./lex-in-el < compiler-tests/testcase3.t 5 16 Keyword_print 5 40 Op_subtract 6 16 Keyword_putc 6 40 Op_less 7 16 Keyword_if 7 40 Op_greater 8 16 Keyword_else 8 40 Op_lessequal 9 16 Keyword_while 9 40 Op_greaterequal 10 16 LeftBrace 10 40 Op_equal 11 16 RightBrace 11 40 Op_notequal 12 16 LeftParen 12 40 Op_and 13 16 RightParen 13 40 Op_or 14 16 Op_subtract 14 40 Semicolon 15 16 Op_not 15 40 Comma 16 16 Op_multiply 16 40 Op_assign 17 16 Op_divide 17 40 Integer 42 18 16 Op_mod 18 40 String "String literal" 19 16 Op_add 19 40 Identifier variable_name 20 26 Integer 10 21 26 Integer 92 22 26 Integer 32 23 1 End_of_input</pre> =={{header\|Erlang}}== {{works with\|Erlang\|24.3.3}} {{trans\|ATS}} {{trans\|Elixir}} <syntaxhighlight lang="erlang">#!/bin/env escript %%%------------------------------------------------------------------- -record (inp_t, {inpf, pushback, line_no, column_no}). main (Args) -> main_program (Args). main_program ([]) -> scan_from_inpf_to_outf ("-", "-"), halt (0); main_program ([Inpf_filename]) -> scan_from_inpf_to_outf (Inpf_filename, "-"), halt (0); main_program ([Inpf_filename, Outf_filename]) -> scan_from_inpf_to_outf (Inpf_filename, Outf_filename), halt (0); main_program ([_, _ \| _]) -> ProgName = escript:script_name (), io:put_chars (standard_error, "Usage: "), io:put_chars (standard_error, ProgName), io:put_chars (standard_error, " [INPUTFILE [OUTPUTFILE]]\n"), halt (1). scan_from_inpf_to_outf ("-", "-") -> scan_input (standard_io, standard_io); scan_from_inpf_to_outf (Inpf_filename, "-") -> case file:open (Inpf_filename, [read]) of {ok, Inpf} -> scan_input (Inpf, standard_io); _ -> open_failure (Inpf_filename, "input") end; scan_from_inpf_to_outf ("-", Outf_filename) -> case file:open (Outf_filename, [write]) of {ok, Outf} -> scan_input (standard_io, Outf); _ -> open_failure (Outf_filename, "output") end; scan_from_inpf_to_outf (Inpf_filename, Outf_filename) -> case file:open(Inpf_filename, [read]) of {ok, Inpf} -> case file:open (Outf_filename, [write]) of {ok, Outf} -> scan_input (Inpf, Outf); _ -> open_failure (Outf_filename, "output") end; _ -> open_failure (Inpf_filename, "input") end. open_failure (Filename, ForWhat) -> ProgName = escript:script_name (), io:put_chars (standard_error, ProgName), io:put_chars (standard_error, ": failed to open \""), io:put_chars (standard_error, Filename), io:put_chars (standard_error, "\" for "), io:put_chars (standard_error, ForWhat), io:put_chars (standard_error, "\n"), halt (1). scan_input (Inpf, Outf) -> scan_text (Outf, make_inp (Inpf)). scan_text (Outf, Inp) -> {TokTup, Inp1} = get_next_token (Inp), print_token (Outf, TokTup), case TokTup of {"End_of_input", _, _, _} -> ok; _ -> scan_text (Outf, Inp1) end. print_token (Outf, {Tok, Arg, Line_no, Column_no}) -> S_line_no = erlang:integer_to_list (Line_no), S_column_no = erlang:integer_to_list (Column_no), io:put_chars (Outf, string:pad (S_line_no, 5, leading)), io:put_chars (Outf, " "), io:put_chars (Outf, string:pad (S_column_no, 5, leading)), io:put_chars (Outf, " "), io:put_chars (Outf, Tok), {Padding, Arg1} = case Tok of "Identifier" -> {" ", Arg}; "Integer" -> {" ", Arg}; "String" -> {" ", Arg}; _ -> {"", ""} end, io:put_chars (Outf, Padding), io:put_chars (Outf, Arg1), io:put_chars ("\n"). %%%------------------------------------------------------------------- %%% %%% The token dispatcher. %%% get_next_token (Inp) -> Inp00 = skip_spaces_and_comments (Inp), {Ch, Inp0} = get_ch (Inp00), {Char, Line_no, Column_no} = Ch, Ln = Line_no, Cn = Column_no, case Char of eof -> {{"End_of_input", "", Ln, Cn}, Inp0}; "," -> {{"Comma", ",", Ln, Cn}, Inp0}; ";" -> {{"Semicolon", ";", Ln, Cn}, Inp0}; "(" -> {{"LeftParen", "(", Ln, Cn}, Inp0}; ")" -> {{"RightParen", ")", Ln, Cn}, Inp0}; "{" -> {{"LeftBrace", "{", Ln, Cn}, Inp0}; "}" -> {{"RightBrace", "}", Ln, Cn}, Inp0}; "" -> {{"Op_multiply", "", Ln, Cn}, Inp0}; "/" -> {{"Op_divide", "/", Ln, Cn}, Inp0}; "%" -> {{"Op_mod", "%", Ln, Cn}, Inp0}; "+" -> {{"Op_add", "+", Ln, Cn}, Inp0}; "-" -> {{"Op_subtract", "-", Ln, Cn}, Inp0}; "<" -> {Ch1, Inp1} = get_ch (Inp0), {Char1, _, _} = Ch1, case Char1 of "=" -> {{"Op_lessequal", "<=", Ln, Cn}, Inp1}; _ -> {{"Op_less", "<", Ln, Cn}, push_back (Ch1, Inp1)} end; ">" -> {Ch1, Inp1} = get_ch (Inp0), {Char1, _, _} = Ch1, case Char1 of "=" -> {{"Op_greaterequal", ">=", Ln, Cn}, Inp1}; _ -> {{"Op_greater", ">", Ln, Cn}, push_back (Ch1, Inp1)} end; "=" -> {Ch1, Inp1} = get_ch (Inp0), {Char1, _, _} = Ch1, case Char1 of "=" -> {{"Op_equal", "==", Ln, Cn}, Inp1}; _ -> {{"Op_assign", "=", Ln, Cn}, push_back (Ch1, Inp1)} end; "!" -> {Ch1, Inp1} = get_ch (Inp0), {Char1, _, _} = Ch1, case Char1 of "=" -> {{"Op_notequal", "!=", Ln, Cn}, Inp1}; _ -> {{"Op_not", "!", Ln, Cn}, push_back (Ch1, Inp1)} end; "&" -> {Ch1, Inp1} = get_ch (Inp0), {Char1, _, _} = Ch1, case Char1 of "&" -> {{"Op_and", "&&", Ln, Cn}, Inp1}; _ -> unexpected_character (Ln, Cn, Char) end; "\|" -> {Ch1, Inp1} = get_ch (Inp0), {Char1, _, _} = Ch1, case Char1 of "\|" -> {{"Op_or", "\|\|", Ln, Cn}, Inp1}; _ -> unexpected_character (Ln, Cn, Char) end; "\"" -> Inp1 = push_back (Ch, Inp0), scan_string_literal (Inp1); "'" -> Inp1 = push_back (Ch, Inp0), scan_character_literal (Inp1); _ -> case is_digit (Char) of true -> Inp1 = push_back (Ch, Inp0), scan_integer_literal (Inp1); false -> case is_alpha_or_underscore (Char) of true -> Inp1 = push_back (Ch, Inp0), scan_identifier_or_reserved_word (Inp1); false -> unexpected_character (Ln, Cn, Char) end end end. %%%------------------------------------------------------------------- %%% %%% Skipping past spaces and /* ... / comments. %%% %%% Comments are treated exactly like a bit of whitespace. They never %%% make it to the dispatcher. %%% skip_spaces_and_comments (Inp) -> {Ch, Inp0} = get_ch (Inp), {Char, Line_no, Column_no} = Ch, case classify_char (Char) of eof -> push_back (Ch, Inp0); space -> skip_spaces_and_comments (Inp0); slash -> {Ch1, Inp1} = get_ch (Inp0), case Ch1 of {"", _, _} -> Inp2 = scan_comment (Inp1, Line_no, Column_no), skip_spaces_and_comments (Inp2); _ -> push_back (Ch, (push_back (Ch1, Inp1))) end; other -> push_back (Ch, Inp0) end. classify_char (Char) -> case Char of eof -> eof; "/" -> slash; _ -> case is_space (Char) of true -> space; false -> other end end. scan_comment (Inp, Line_no, Column_no) -> {Ch0, Inp0} = get_ch (Inp), case Ch0 of {eof, _, _} -> unterminated_comment (Line_no, Column_no); {"", _, _} -> {Ch1, Inp1} = get_ch (Inp0), case Ch1 of {eof, _, _} -> unterminated_comment (Line_no, Column_no); {"/", _, _} -> Inp1; _ -> scan_comment (Inp1, Line_no, Column_no) end; _ -> scan_comment (Inp0, Line_no, Column_no) end. is_space (S) -> case re:run (S, "^[[:space:]]+$") of {match, _} -> true; _ -> false end. %%%------------------------------------------------------------------- %%% %%% Scanning of integer literals, identifiers, and reserved words. %%% %%% These three types of token are very similar to each other. %%% scan_integer_literal (Inp) -> %% Scan an entire word, not just digits. This way we detect %% erroneous text such as "23skidoo". {Line_no, Column_no, Inp1} = get_position (Inp), {Word, Inp2} = scan_word (Inp1), case is_digit (Word) of true -> {{"Integer", Word, Line_no, Column_no}, Inp2}; false -> invalid_integer_literal (Line_no, Column_no, Word) end. scan_identifier_or_reserved_word (Inp) -> %% It is assumed that the first character is of the correct type, %% thanks to the dispatcher. {Line_no, Column_no, Inp1} = get_position (Inp), {Word, Inp2} = scan_word (Inp1), Tok = case Word of "if" -> "Keyword_if"; "else" -> "Keyword_else"; "while" -> "Keyword_while"; "print" -> "Keyword_print"; "putc" -> "Keyword_putc"; _ -> "Identifier" end, {{Tok, Word, Line_no, Column_no}, Inp2}. scan_word (Inp) -> scan_word_loop (Inp, ""). scan_word_loop (Inp, Word0) -> {Ch1, Inp1} = get_ch (Inp), {Char1, _, _} = Ch1, case is_alnum_or_underscore (Char1) of true -> scan_word_loop (Inp1, Word0 ++ Char1); false -> {Word0, push_back (Ch1, Inp1)} end. get_position (Inp) -> {Ch1, Inp1} = get_ch (Inp), {_, Line_no, Column_no} = Ch1, Inp2 = push_back (Ch1, Inp1), {Line_no, Column_no, Inp2}. is_digit (S) -> case re:run (S, "^[[:digit:]]+$") of {match, _} -> true; _ -> false end. is_alpha_or_underscore (S) -> case re:run (S, "^[[:alpha:]_]+$") of {match, _} -> true; _ -> false end. is_alnum_or_underscore (S) -> case re:run (S, "^[[:alnum:]_]+$") of {match, _} -> true; _ -> false end. %%%------------------------------------------------------------------- %%% %%% Scanning of string literals. %%% %%% It is assumed that the first character is the opening quote, and %%% that the closing quote is the same character. %%% scan_string_literal (Inp) -> {Ch1, Inp1} = get_ch (Inp), {Quote_mark, Line_no, Column_no} = Ch1, {Contents, Inp2} = scan_str_lit (Inp1, Ch1), Toktup = {"String", Quote_mark ++ Contents ++ Quote_mark, Line_no, Column_no}, {Toktup, Inp2}. scan_str_lit (Inp, Ch) -> scan_str_lit_loop (Inp, Ch, ""). scan_str_lit_loop (Inp, Ch, Contents) -> {Quote_mark, Line_no, Column_no} = Ch, {Ch1, Inp1} = get_ch (Inp), {Char1, Line_no1, Column_no1} = Ch1, case Char1 of Quote_mark -> {Contents, Inp1}; eof -> eoi_in_string_literal (Line_no, Column_no); "\n" -> eoln_in_string_literal (Line_no, Column_no); "\\" -> {Ch2, Inp2} = get_ch (Inp1), {Char2, _, _} = Ch2, case Char2 of "n" -> scan_str_lit_loop (Inp2, Ch, Contents ++ "\\n"); "\\" -> scan_str_lit_loop (Inp2, Ch, Contents ++ "\\\\"); _ -> unsupported_escape (Line_no1, Column_no1, Char2) end; _ -> scan_str_lit_loop (Inp1, Ch, Contents ++ Char1) end. %%%------------------------------------------------------------------- %%% %%% Scanning of character literals. %%% %%% It is assumed that the first character is the opening quote, and %%% that the closing quote is the same character. %%% %%% The tedious part of scanning a character literal is distinguishing %%% between the kinds of lexical error. (One might wish to modify the %%% code to detect, as a distinct kind of error, end of line within a %%% character literal.) %%% scan_character_literal (Inp) -> {Ch, Inp0} = get_ch (Inp), {_, Line_no, Column_no} = Ch, {Ch1, Inp1} = get_ch (Inp0), {Char1, Line_no1, Column_no1} = Ch1, {Intval, Inp3} = case Char1 of eof -> unterminated_character_literal (Line_no, Column_no); "\\" -> {Ch2, Inp2} = get_ch (Inp1), {Char2, _, _} = Ch2, case Char2 of eof -> unterminated_character_literal (Line_no, Column_no); "n" -> {char_to_code ("\n"), Inp2}; "\\" -> {char_to_code ("\\"), Inp2}; _ -> unsupported_escape (Line_no1, Column_no1, Char2) end; _ -> {char_to_code (Char1), Inp1} end, Inp4 = check_character_literal_end (Inp3, Ch), {{"Integer", Intval, Line_no, Column_no}, Inp4}. char_to_code (Char) -> %% Hat tip to https://archive.ph/BxZRS lists:flatmap (fun erlang:integer_to_list/1, Char). check_character_literal_end (Inp, Ch) -> {Char, _, _} = Ch, {{Char1, _, _}, Inp1} = get_ch (Inp), case Char1 of Char -> Inp1; _ -> find_char_lit_end (Inp1, Ch) % Handle a lexical error. end. find_char_lit_end (Inp, Ch) -> %% There is a lexical error. Determine which kind it fits into. {Char, Line_no, Column_no} = Ch, {{Char1, _, _}, Inp1} = get_ch (Inp), case Char1 of Char -> multicharacter_literal (Line_no, Column_no); eof -> unterminated_character_literal (Line_no, Column_no); _ -> find_char_lit_end (Inp1, Ch) end. %%%------------------------------------------------------------------- %%% %%% Character-at-a-time input, with unrestricted pushback, and with %%% line and column numbering. %%% make_inp (Inpf) -> #inp_t{inpf = Inpf, pushback = [], line_no = 1, column_no = 1}. get_ch (Inp) -> #inp_t{inpf = Inpf, pushback = Pushback, line_no = Line_no, column_no = Column_no} = Inp, case Pushback of [Ch \| Tail] -> Inp1 = Inp#inp_t{pushback = Tail}, {Ch, Inp1}; [] -> case io:get_chars (Inpf, "", 1) of eof -> Ch = {eof, Line_no, Column_no}, {Ch, Inp}; {error, _} -> Ch = {eof, Line_no, Column_no}, {Ch, Inp}; Char -> case Char of "\n" -> Ch = {Char, Line_no, Column_no}, Inp1 = Inp#inp_t{line_no = Line_no + 1, column_no = 1}, {Ch, Inp1}; _ -> Ch = {Char, Line_no, Column_no}, Inp1 = Inp#inp_t{column_no = Column_no + 1}, {Ch, Inp1} end end end. push_back (Ch, Inp) -> Inp#inp_t{pushback = [Ch \| Inp#inp_t.pushback]}. %%%------------------------------------------------------------------- invalid_integer_literal (Line_no, Column_no, Word) -> error_abort ("invalid integer literal \"" ++ Word ++ "\" at " ++ integer_to_list (Line_no) ++ ":" ++ integer_to_list (Column_no)). unsupported_escape (Line_no, Column_no, Char) -> error_abort ("unsupported escape \\" ++ Char ++ " at " ++ integer_to_list (Line_no) ++ ":" ++ integer_to_list (Column_no)). unexpected_character (Line_no, Column_no, Char) -> error_abort ("unexpected character '" ++ Char ++ "' at " ++ integer_to_list (Line_no) ++ ":" ++ integer_to_list (Column_no)). eoi_in_string_literal (Line_no, Column_no) -> error_abort ("end of input in string literal starting at " ++ integer_to_list (Line_no) ++ ":" ++ integer_to_list (Column_no)). eoln_in_string_literal (Line_no, Column_no) -> error_abort ("end of line in string literal starting at " ++ integer_to_list (Line_no) ++ ":" ++ integer_to_list (Column_no)). unterminated_character_literal (Line_no, Column_no) -> error_abort ("unterminated character literal starting at " ++ integer_to_list (Line_no) ++ ":" ++ integer_to_list (Column_no)). multicharacter_literal (Line_no, Column_no) -> error_abort ("unsupported multicharacter literal starting at " ++ integer_to_list (Line_no) ++ ":" ++ integer_to_list (Column_no)). unterminated_comment (Line_no, Column_no) -> error_abort ("unterminated comment starting at " ++ integer_to_list (Line_no) ++ ":" ++ integer_to_list (Column_no)). error_abort (Message) -> ProgName = escript:script_name (), io:put_chars (standard_error, ProgName), io:put_chars (standard_error, ": "), io:put_chars (standard_error, Message), io:put_chars (standard_error, "\n"), halt (1). %%%------------------------------------------------------------------- %%% Instructions to GNU Emacs -- %%% local variables: %%% mode: erlang %%% erlang-indent-level: 3 %%% end: %%%-------------------------------------------------------------------</syntaxhighlight> {{out}} <pre>$ ./lex-in-Erlang compiler-tests/testcase3.t 5 16 Keyword_print 5 40 Op_subtract 6 16 Keyword_putc 6 40 Op_less 7 16 Keyword_if 7 40 Op_greater 8 16 Keyword_else 8 40 Op_lessequal 9 16 Keyword_while 9 40 Op_greaterequal 10 16 LeftBrace 10 40 Op_equal 11 16 RightBrace 11 40 Op_notequal 12 16 LeftParen 12 40 Op_and 13 16 RightParen 13 40 Op_or 14 16 Op_subtract 14 40 Semicolon 15 16 Op_not 15 40 Comma 16 16 Op_multiply 16 40 Op_assign 17 16 Op_divide 17 40 Integer 42 18 16 Op_mod 18 40 String "String literal" 19 16 Op_add 19 40 Identifier variable_name 20 26 Integer 10 21 26 Integer 92 22 26 Integer 32 23 1 End_of_input</pre> =={{header\|Euphoria}}== Tested with Euphoria 4.05. <~~lang~~syntaxhighlight lang="euphoria">include std/io.e include std/map.e include std/types.e Line 4,213 ⟶ 5,877: end procedure main(command_line())</~~lang~~syntaxhighlight> {{out\|case=test case 3}} Line 4,257 ⟶ 5,921: =={{header\|Flex}}== Tested with Flex 2.5.4. <syntaxhighlight lang="c">%{ ~~<lang C>%{~~ #include <stdio.h> #include <stdlib.h> Line 4,430 ⟶ 6,094: } while (tok != tk_EOI); return 0; }</~~lang~~syntaxhighlight> {{out\|case=test case 3}} Line 4,474 ⟶ 6,138: =={{header\|Forth}}== Tested with Gforth 0.7.3. <~~lang~~syntaxhighlight ~~Forth~~lang="forth">CREATE BUF 0 , \ single-character look-ahead buffer CREATE COLUMN# 0 , CREATE LINE# 1 , Line 4,596 ⟶ 6,260: THEN THEN ; : TOKENIZE BEGIN CONSUME AGAIN ; TOKENIZE</~~lang~~syntaxhighlight> {{out}} Line 4,610 ⟶ 6,274: The author has placed this Fortran code in the public domain. <syntaxhighlight lang="fortran">!!! ~~<lang Fortran>!!!~~ !!! An implementation of the Rosetta Code lexical analyzer task: !!! https://rosettacode.org/wiki/Compiler/lexical_analyzer Line 5,688 ⟶ 7,352: end subroutine print_usage end program lex</~~lang~~syntaxhighlight> {{out}} Line 5,729 ⟶ 7,393: =={{header\|FreeBASIC}}== Tested with FreeBASIC 1.05 <~~lang~~syntaxhighlight ~~FreeBASIC~~lang="freebasic">enum Token_type tk_EOI tk_Mul Line 6,015 ⟶ 7,679: print : print "Hit any to end program" sleep system</~~lang~~syntaxhighlight> {{out\|case=test case 3}} <b> Line 6,056 ⟶ 7,720: =={{header\|Go}}== {{trans\|FreeBASIC}} <~~lang~~syntaxhighlight lang="go">package main import ( Line 6,433 ⟶ 8,097: initLex() process() }</~~lang~~syntaxhighlight> {{out}} Line 6,476 ⟶ 8,140: =={{header\|Haskell}}== Tested with GHC 8.0.2 <~~lang~~syntaxhighlight lang="haskell">import Control.Applicative hiding (many, some) import Control.Monad.State.Lazy import Control.Monad.Trans.Maybe (MaybeT, runMaybeT) Line 6,780 ⟶ 8,444: where (Just t, s') = runState (runMaybeT lexer) s (txt, _, _) = s' </syntaxhighlight> ~~</lang>~~ {{out\|case=test case 3}} Line 6,832 ⟶ 8,496: Global variables are avoided except for some constants that require initialization. <syntaxhighlight lang="icon"># ~~<lang Icon>#~~ # The Rosetta Code lexical analyzer in Icon with co-expressions. Based # upon the ATS implementation. Line 6,891 ⟶ 8,555: inpf := &input outf := &output if 1 <= args & args[1] ~== "-" then ~~inpf := open(args[1], "rt")~~{ if 2 ~~<= args & args[2] ~== "-" then outf~~inpf := open(args[21], "wtrt") \| stop("cannot open ", args[1], " for input") } if 2 <= args & args[2] ~== "-" then { outf := open(args[2], "wt") \| stop("cannot open ", args[2], " for output") } pushback_buffer := [] Line 7,061 ⟶ 8,731: toktup := scan_character_literal_without_checking_end(inp) ch1 := @inp ~~while EOF ~=== ch1[1] &~~if ch1[1] ~=== close_quote dothen { ~~case ch1[1] of~~repeat { case ch1[1] of { ~~EOF : unterminated_character_literal(line_no, column_no)~~ ~~close_quote~~ EOF : ~~multicharacter_literal~~unterminated_character_literal(line_no, column_no) close_quote : multicharacter_literal(line_no, column_no) ~~default : ch1 := @inp~~ default : ch1 := @inp } } } Line 7,151 ⟶ 8,823: if ch[1] == "" then { ch1 := @inp (EOF === chch1[1]) & unterminated_comment(line_no, column_no) } } Line 7,322 ⟶ 8,994: procedure max(x, y) return (if x < y then y else x) end</~~lang~~syntaxhighlight> Line 7,371 ⟶ 9,043: Implementation: <~~lang~~syntaxhighlight Jlang="j">symbols=:256#0 ch=: {{1 0+x[symbols=: x (a.i.y)} symbols}} 'T0 token' =: 0 ch '%+-!(){};,<>=!\|&' Line 7,491 ⟶ 9,163: keep=. (tokens~:<,'''')-.comments+.whitespace+.unknowna:=values keep&#each ((1+lines),.columns);<names,.values }}</~~lang~~syntaxhighlight> Test case 3: <syntaxhighlight lang="j"> ~~<lang J>~~ flex=: {{ 'A B'=.y Line 7,561 ⟶ 9,233: 21 28 Integer 92 22 27 Integer 32 23 1 End_of_input </~~lang~~syntaxhighlight> Here, it seems expedient to retain a structured representation of the lexical result. As shown, it's straightforward to produce a "pure" textual result for a hypothetical alternative implementation of the syntax analyzer, but the structured representation will be easier to deal with. =={{header\|Java}}== <~~lang~~syntaxhighlight lang="java"> // Translated from python source Line 7,716 ⟶ 9,388: if (text.equals("")) { error(line, pos, String.format("identifer_or_integer ~~unrecopgnized~~unrecognized character: (%d) %c", (int)this.chr, this.chr)); } if (Character.isDigit(text.charAt(0))) { if (!is_number) { error(line, pos, String.format("~~invaslid~~invalid number: %s", text)); } return new Token(TokenType.Integer, text, line, pos); Line 7,807 ⟶ 9,479: } } </syntaxhighlight> ~~</lang>~~ =={{header\|JavaScript}}== {{incorrect\|Javascript\|Please show output. Code is identical to [[Compiler/syntax_analyzer]] task}} <~~lang~~syntaxhighlight lang="javascript"> / Token: type, value, line, pos Line 8,024 ⟶ 9,696: l.printTokens() }) </syntaxhighlight> ~~</lang>~~ =={{header\|Julia}}== <~~lang~~syntaxhighlight lang="julia">struct Tokenized startline::Int startcol::Int Line 8,182 ⟶ 9,854: println(lpad(tok.startline, 3), lpad(tok.startcol, 5), lpad(tok.name, 18), " ", tok.value != nothing ? tok.value : "") end </~~lang~~syntaxhighlight>{{output}}<pre> Line Col Name Value 5 16 Keyword_print Line 8,219 ⟶ 9,891: 23 1 End_of_input </pre> =={{header\|kotlin}}== {{trans\|Java}} <syntaxhighlight lang="kotlin">// Input: command line argument of file to process or console input. A two or // three character console input of digits followed by a new line will be // checked for an integer between zero and twenty-five to select a fixed test // case to run. Any other console input will be parsed. // Code based on the Java version found here: // https://rosettacode.org/mw/index.php?title=Compiler/lexical_analyzer&action=edit&section=22 // Class to halt the parsing with an exception. class ParsingFailed(message: String): Exception(message) // Enumerate class of tokens supported by this scanner. enum class TokenType { Tk_End_of_input, Op_multiply, Op_divide, Op_mod, Op_add, Op_subtract, Op_negate, Op_not, Op_less, Op_lessequal, Op_greater, Op_greaterequal, Op_equal, Op_notequal, Op_assign, Op_and, Op_or, Kw_if, Kw_else, Kw_while, Kw_print, Kw_putc, Sy_LeftParen, Sy_RightParen, Sy_LeftBrace, Sy_RightBrace, Sy_Semicolon, Sy_Comma, Tk_Identifier, Tk_Integer, Tk_String; override fun toString() = listOf("End_of_input", "Op_multiply", "Op_divide", "Op_mod", "Op_add", "Op_subtract", "Op_negate", "Op_not", "Op_less", "Op_lessequal", "Op_greater", "Op_greaterequal", "Op_equal", "Op_notequal", "Op_assign", "Op_and", "Op_or", "Keyword_if", "Keyword_else", "Keyword_while", "Keyword_print", "Keyword_putc", "LeftParen", "RightParen", "LeftBrace", "RightBrace", "Semicolon", "Comma", "Identifier", "Integer", "String")[this.ordinal] } // TokenType // Data class of tokens returned by the scanner. data class Token(val token: TokenType, val value: String, val line: Int, val pos: Int) { // Overridden method to display the token. override fun toString() = "%5d %5d %-15s %s".format(line, pos, this.token, when (this.token) { TokenType.Tk_Integer, TokenType.Tk_Identifier -> " %s".format(this.value) TokenType.Tk_String -> this.value.toList().joinToString("", " \"", "\"") { when (it) { '\t' -> "\\t" '\n' -> "\\n" '\u000b' -> "\\v" '\u000c' -> "\\f" '\r' -> "\\r" '"' -> "\\\"" '\\' -> "\\" in ' '..'~' -> "$it" else -> "\\u%04x".format(it.code) } } else -> "" } ) } // Token // Function to display an error message and halt the scanner. fun error(line: Int, pos: Int, msg: String): Nothing = throw ParsingFailed("(%d, %d) %s\n".format(line, pos, msg)) // Class to process the source into tokens with properties of the // source string, the line number, the column position, the index // within the source string, the current character being processed, // and map of the keyword strings to the corresponding token type. class Lexer(private val s: String) { private var line = 1 private var pos = 1 private var position = 0 private var chr = if (s.isEmpty()) ' ' else s[0] private val keywords = mapOf<String, TokenType>( "if" to TokenType.Kw_if, "else" to TokenType.Kw_else, "print" to TokenType.Kw_print, "putc" to TokenType.Kw_putc, "while" to TokenType.Kw_while) // Method to retrive the next character from the source. Use null after // the end of our source. private fun getNextChar() = if (++this.position >= this.s.length) { this.pos++ this.chr = '\u0000' this.chr } else { this.pos++ this.chr = this.s[this.position] when (this.chr) { '\n' -> { this.line++ this.pos = 0 } // line '\t' -> while (this.pos%8 != 1) this.pos++ } // when this.chr } // if // Method to return the division token, skip the comment, or handle the // error. private fun div_or_comment(line: Int, pos: Int): Token = if (getNextChar() != '') Token(TokenType.Op_divide, "", line, pos); else { getNextChar() // Skip comment start outer@ while (true) when (this.chr) { '\u0000' -> error(line, pos, "Lexer: EOF in comment"); '' -> if (getNextChar() == '/') { getNextChar() // Skip comment end break@outer } // if else -> getNextChar() } // when getToken() } // if // Method to verify a character literal. Return the token or handle the // error. private fun char_lit(line: Int, pos: Int): Token { var c = getNextChar() // skip opening quote when (c) { '\'' -> error(line, pos, "Lexer: Empty character constant"); '\\' -> c = when (getNextChar()) { 'n' -> 10.toChar() '\\' -> '\\' '\'' -> '\'' else -> error(line, pos, "Lexer: Unknown escape sequence '\\%c'". format(this.chr)) } } // when if (getNextChar() != '\'') error(line, pos, "Lexer: Multi-character constant") getNextChar() // Skip closing quote return Token(TokenType.Tk_Integer, c.code.toString(), line, pos) } // char_lit // Method to check next character to see whether it belongs to the token // we might be in the middle of. Return the correct token or handle the // error. private fun follow(expect: Char, ifyes: TokenType, ifno: TokenType, line: Int, pos: Int): Token = when { getNextChar() == expect -> { getNextChar() Token(ifyes, "", line, pos) } // matches ifno == TokenType.Tk_End_of_input -> error(line, pos, "Lexer: %c expected: (%d) '%c'".format(expect, this.chr.code, this.chr)) else -> Token(ifno, "", line, pos) } // when // Method to verify a character string. Return the token or handle the // error. private fun string_lit(start: Char, line: Int, pos: Int): Token { var result = "" while (getNextChar() != start) when (this.chr) { '\u0000' -> error(line, pos, "Lexer: EOF while scanning string literal") '\n' -> error(line, pos, "Lexer: EOL while scanning string literal") '\\' -> when (getNextChar()) { '\\' -> result += '\\' 'n' -> result += '\n' '"' -> result += '"' else -> error(line, pos, "Lexer: Escape sequence unknown '\\%c'". format(this.chr)) } // when else -> result += this.chr } // when getNextChar() // Toss closing quote return Token(TokenType.Tk_String, result, line, pos) } // string_lit // Method to retrive an identifier or integer. Return the keyword // token, if the string matches one. Return the integer token, // if the string is all digits. Return the identifer token, if the // string is valid. Otherwise, handle the error. private fun identifier_or_integer(line: Int, pos: Int): Token { var is_number = true var text = "" while (this.chr in listOf('_')+('0'..'9')+('a'..'z')+('A'..'Z')) { text += this.chr is_number = is_number && this.chr in '0'..'9' getNextChar() } // while if (text.isEmpty()) error(line, pos, "Lexer: Unrecognized character: (%d) %c". format(this.chr.code, this.chr)) return when { text[0] in '0'..'9' -> if (!is_number) error(line, pos, "Lexer: Invalid number: %s". format(text)) else { val max = Int.MAX_VALUE.toString() if (text.length > max.length \|\| (text.length == max.length && max < text)) error(line, pos, "Lexer: Number exceeds maximum value %s". format(text)) Token(TokenType.Tk_Integer, text, line, pos) } // if this.keywords.containsKey(text) -> Token(this.keywords[text]!!, "", line, pos) else -> Token(TokenType.Tk_Identifier, text, line, pos) } } // identifier_or_integer // Method to skip whitespace both C's and Unicode ones and retrive the next // token. private fun getToken(): Token { while (this.chr in listOf('\t', '\n', '\u000b', '\u000c', '\r', ' ') \|\| this.chr.isWhitespace()) getNextChar() val line = this.line val pos = this.pos return when (this.chr) { '\u0000' -> Token(TokenType.Tk_End_of_input, "", line, pos) '/' -> div_or_comment(line, pos) '\'' -> char_lit(line, pos) '<' -> follow('=', TokenType.Op_lessequal, TokenType.Op_less, line, pos) '>' -> follow('=', TokenType.Op_greaterequal, TokenType.Op_greater, line, pos) '=' -> follow('=', TokenType.Op_equal, TokenType.Op_assign, line, pos) '!' -> follow('=', TokenType.Op_notequal, TokenType.Op_not, line, pos) '&' -> follow('&', TokenType.Op_and, TokenType.Tk_End_of_input, line, pos) '\|' -> follow('\|', TokenType.Op_or, TokenType.Tk_End_of_input, line, pos) '"' -> string_lit(this.chr, line, pos) '{' -> { getNextChar() Token(TokenType.Sy_LeftBrace, "", line, pos) } // open brace '}' -> { getNextChar() Token(TokenType.Sy_RightBrace, "", line, pos) } // close brace '(' -> { getNextChar() Token(TokenType.Sy_LeftParen, "", line, pos) } // open paren ')' -> { getNextChar() Token(TokenType.Sy_RightParen, "", line, pos) } // close paren '+' -> { getNextChar() Token(TokenType.Op_add, "", line, pos) } // plus '-' -> { getNextChar() Token(TokenType.Op_subtract, "", line, pos) } // dash '' -> { getNextChar() Token(TokenType.Op_multiply, "", line, pos) } // asterisk '%' -> { getNextChar() Token(TokenType.Op_mod, "", line, pos) } // percent ';' -> { getNextChar() Token(TokenType.Sy_Semicolon, "", line, pos) } // semicolon ',' -> { getNextChar() Token(TokenType.Sy_Comma, "", line, pos) } // comma else -> identifier_or_integer(line, pos) } } // getToken // Method to parse and display tokens. fun printTokens() { do { val t: Token = getToken() println(t) } while (t.token != TokenType.Tk_End_of_input) } // printTokens } // Lexer // Function to test all good tests from the website and produce all of the // error messages this program supports. fun tests(number: Int) { // Function to generate test case 0 source: Hello World/Text. fun hello() { Lexer( """/ Hello world / print("Hello, World!\n"); """).printTokens() } // hello // Function to generate test case 1 source: Phoenix Number. fun phoenix() { Lexer( """/ Show Ident and Integers / phoenix_number = 142857; print(phoenix_number, "\n");""").printTokens() } // phoenix // Function to generate test case 2 source: All Symbols. fun symbols() { Lexer( """/ All lexical tokens - not syntactically correct, but that will have to wait until syntax analysis / / Print / print / Sub / - / Putc / putc / Lss / < / If / if / Gtr / > / Else / else / Leq / <= / While / while / Geq / >= / Lbrace / { / Eq / == / Rbrace / } / Neq / != / Lparen / ( / And / && / Rparen / ) / Or / \|\| / Uminus / - / Semi / ; / Not / ! / Comma / , / Mul / /* Assign / = / Div / / / Integer / 42 / Mod / % / String / "String literal" / Add / + / Ident / variable_name / character literal / '\n' / character literal / '\\' / character literal / ' '""").printTokens() } // symbols // Function to generate test case 3 source: Test Case 4. fun four() { Lexer( """/** test printing, embedded \n and comments with lots of '' */ print(42); print("\nHello World\nGood Bye\nok\n"); print("Print a slash n - \\n.\n");""").printTokens() } // four // Function to generate test case 4 source: Count. fun count() { Lexer( """count = 1; while (count < 10) { print("count is: ", count, "\n"); count = count + 1; }""").printTokens() } // count // Function to generate test case 5 source: 100 Doors. fun doors() { Lexer( """/ 100 Doors / i = 1; while (i i <= 100) { print("door ", i * i, " is open\n"); i = i + 1; }""").printTokens() } // doors // Function to generate test case 6 source: Negative Tests. fun negative() { Lexer( """a = (-1 * ((-1 * (5 * 15)) / 10)); print(a, "\n"); b = -a; print(b, "\n"); print(-b, "\n"); print(-(1), "\n");""").printTokens() } // negative // Function to generate test case 7 source: Deep. fun deep() { Lexer( """print(---------------------------------+++5, "\n"); print(((((((((3 + 2) * ((((((2))))))))))))), "\n"); if (1) { if (1) { if (1) { if (1) { if (1) { print(15, "\n"); } } } } }""").printTokens() } // deep // Function to generate test case 8 source: Greatest Common Divisor. fun gcd() { Lexer( """/* Compute the gcd of 1071, 1029: 21 / a = 1071; b = 1029; while (b != 0) { new_a = b; b = a % b; a = new_a; } print(a);""").printTokens() } // gcd // Function to generate test case 9 source: Factorial. fun factorial() { Lexer( """/ 12 factorial is 479001600 / n = 12; result = 1; i = 1; while (i <= n) { result = result i; i = i + 1; } print(result);""").printTokens() } // factorial // Function to generate test case 10 source: Fibonacci Sequence. fun fibonacci() { Lexer( """/* fibonacci of 44 is 701408733 / n = 44; i = 1; a = 0; b = 1; while (i < n) { w = a + b; a = b; b = w; i = i + 1; } print(w, "\n");""").printTokens() } // fibonacci // Function to generate test case 11 source: FizzBuzz. fun fizzbuzz() { Lexer( """/ FizzBuzz / i = 1; while (i <= 100) { if (!(i % 15)) print("FizzBuzz"); else if (!(i % 3)) print("Fizz"); else if (!(i % 5)) print("Buzz"); else print(i); print("\n"); i = i + 1; }""").printTokens() } // fizzbuzz // Function to generate test case 12 source: 99 Bottles of Beer. fun bottles() { Lexer( """/ 99 bottles / bottles = 99; while (bottles > 0) { print(bottles, " bottles of beer on the wall\n"); print(bottles, " bottles of beer\n"); print("Take one down, pass it around\n"); bottles = bottles - 1; print(bottles, " bottles of beer on the wall\n\n"); }""").printTokens() } // bottles // Function to generate test case 13 source: Primes. fun primes() { Lexer( """/ Simple prime number generator / count = 1; n = 1; limit = 100; while (n < limit) { k=3; p=1; n=n+2; while ((kk<=n) && (p)) { p=n/kk!=n; k=k+2; } if (p) { print(n, " is prime\n"); count = count + 1; } } print("Total primes found: ", count, "\n");""").printTokens() } // primes // Function to generate test case 14 source: Ascii Mandelbrot. fun ascii() { Lexer( """{ / This is an integer ascii Mandelbrot generator / left_edge = -420; right_edge = 300; top_edge = 300; bottom_edge = -300; x_step = 7; y_step = 15; max_iter = 200; y0 = top_edge; while (y0 > bottom_edge) { x0 = left_edge; while (x0 < right_edge) { y = 0; x = 0; the_char = ' '; i = 0; while (i < max_iter) { x_x = (x x) / 200; y_y = (y * y) / 200; if (x_x + y_y > 800 ) { the_char = '0' + i; if (i > 9) { the_char = '@'; } i = max_iter; } y = x * y / 100 + y0; x = x_x - y_y + x0; i = i + 1; } putc(the_char); x0 = x0 + x_step; } putc('\n'); y0 = y0 - y_step; } } """).printTokens() } // ascii when (number) { 0 -> hello() 1 -> phoenix() 2 -> symbols() 3 -> four() 4 -> count() 5 -> doors() 6 -> negative() 7 -> deep() 8 -> gcd() 9 -> factorial() 10 -> fibonacci() 11 -> fizzbuzz() 12 -> bottles() 13 -> primes() 14 -> ascii() 15 -> // Lexer: Empty character constant Lexer("''").printTokens() 16 -> // Lexer: Unknown escape sequence Lexer("'\\x").printTokens() 17 -> // Lexer: Multi-character constant Lexer("' ").printTokens() 18 -> // Lexer: EOF in comment Lexer("/").printTokens() 19 -> // Lexer: EOL in string Lexer("\"\n").printTokens() 20 -> // Lexer: EOF in string Lexer("\"").printTokens() 21 -> // Lexer: Escape sequence unknown Lexer("\"\\x").printTokens() 22 -> // Lexer: Unrecognized character Lexer("~").printTokens() 23 -> // Lexer: invalid number Lexer("9a9").printTokens() 24 -> // Lexer: Number exceeds maximum value Lexer("2147483648\n9223372036854775808").printTokens() 25 -> // Lexer: Operator expected Lexer("\|.").printTokens() else -> println("Invalid test number %d!".format(number)) } // when } // tests // Main function to check our source and read its data before parsing it. // With no source specified, run the test of all symbols. fun main(args: Array<String>) { try { val s = if (args.size > 0 && args[0].isNotEmpty()) // file on command line java.util.Scanner(java.io.File(args[0])) else // use the console java.util.Scanner(System.`in`) var source = "" while (s.hasNext()) source += s.nextLine()+ if (s.hasNext()) "\n" else "" if (args.size > 0 && args[0].isNotEmpty()) // file on command line Lexer(source).printTokens() else { val digits = source.filter { it in '0'..'9' } when { source.isEmpty() -> // nothing given tests(2) source.length in 1..2 && digits.length == source.length && digits.toInt() in 0..25 -> tests(digits.toInt()) else -> Lexer(source).printTokens() } // when } // if } catch(e: Throwable) { println(e.message) System.exit(1) } // try } // main</syntaxhighlight> {{out\|case=test case 3: All Symbols}} <b> <pre> 5 16 Keyword_print 5 40 Op_subtract 6 16 Keyword_putc 6 40 Op_less 7 16 Keyword_if 7 40 Op_greater 8 16 Keyword_else 8 40 Op_lessequal 9 16 Keyword_while 9 40 Op_greaterequal 10 16 LeftBrace 10 40 Op_equal 11 16 RightBrace 11 40 Op_notequal 12 16 LeftParen 12 40 Op_and 13 16 RightParen 13 40 Op_or 14 16 Op_subtract 14 40 Semicolon 15 16 Op_not 15 40 Comma 16 16 Op_multiply 16 40 Op_assign 17 16 Op_divide 17 40 Integer 42 18 16 Op_mod 18 40 String "String literal" 19 16 Op_add 19 40 Identifier variable_name 20 26 Integer 10 21 26 Integer 92 22 26 Integer 32 22 29 End_of_input </pre> </b> =={{header\|Lua}}== Line 8,227 ⟶ 10,614: The first module is simply a table defining the names of tokens which don't have an associated value. <~~lang~~syntaxhighlight ~~Lua~~lang="lua">-- module token_name (in a file "token_name.lua") local token_name = { [''] = 'Op_multiply', Line 8,256 ⟶ 10,643: ['putc'] = 'Keyword_putc', } return token_name</~~lang~~syntaxhighlight> This module exports a function <i>find_token</i>, which attempts to find the next valid token from a specified position in a source line. <~~lang~~syntaxhighlight ~~Lua~~lang="lua">-- module lpeg_token_finder local M = {} -- only items added to M will be public (via 'return M' at end) local table, concat = table, table.concat Line 8,342 ⟶ 10,729: end return M</~~lang~~syntaxhighlight> The <i>lexer</i> module uses <i>finder.find_token</i> to produce an iterator over the tokens in a source. <~~lang~~syntaxhighlight ~~Lua~~lang="lua">-- module lexer local M = {} -- only items added to M will publicly available (via 'return M' at end) local string, io, coroutine, yield = string, io, coroutine, coroutine.yield Line 8,424 ⟶ 10,811: -- M._INTERNALS = _ENV return M </syntaxhighlight> ~~</lang>~~ This script uses <i>lexer.tokenize_text</i> to show the token sequence produced from a source text. <~~lang~~syntaxhighlight ~~Lua~~lang="lua">lexer = require 'lexer' format, gsub = string.format, string.gsub Line 8,466 ⟶ 10,853: -- etc. end </syntaxhighlight> ~~</lang>~~ ===Using only standard libraries=== This version replaces the <i>lpeg_token_finder</i> module of the LPeg version with this <i>basic_token_finder</i> module, altering the <i>require</i> expression near the top of the <i>lexer</i> module accordingly. Tested with Lua 5.3.5. (Note that <i>select</i> is a standard function as of Lua 5.2.) <~~lang~~syntaxhighlight lang="lua">-- module basic_token_finder local M = {} -- only items added to M will be public (via 'return M' at end) local table, string = table, string Line 8,601 ⟶ 10,988: -- M._ENV = _ENV return M</~~lang~~syntaxhighlight> =={{header\|M2000 Interpreter}}== <syntaxhighlight lang="m2000 interpreter"> ~~<lang M2000 Interpreter>~~ Module lexical_analyzer { a$={/* Line 8,860 ⟶ 11,247: } lexical_analyzer </syntaxhighlight> ~~</lang>~~ {{out}} Line 8,899 ⟶ 11,286: </pre > =={{header\|Mercury}}== {{trans\|ATS}} {{works with\|Mercury\|20.06.1}} <syntaxhighlight lang="mercury">% -- mercury -- % % Compile with maybe something like: % mmc -O4 --intermod-opt -E --make --warn-non-tail-recursion self-and-mutual lex % :- module lex. :- interface. :- import_module io. :- pred main(io::di, io::uo) is det. :- implementation. :- import_module char. :- import_module exception. :- import_module int. :- import_module list. :- import_module stack. :- import_module string. :- type token_t ---> token_ELSE ; token_IF ; token_PRINT ; token_PUTC ; token_WHILE ; token_MULTIPLY ; token_DIVIDE ; token_MOD ; token_ADD ; token_SUBTRACT ; token_NEGATE ; token_LESS ; token_LESSEQUAL ; token_GREATER ; token_GREATEREQUAL ; token_EQUAL ; token_NOTEQUAL ; token_NOT ; token_ASSIGN ; token_AND ; token_OR ; token_LEFTPAREN ; token_RIGHTPAREN ; token_LEFTBRACE ; token_RIGHTBRACE ; token_SEMICOLON ; token_COMMA ; token_IDENTIFIER ; token_INTEGER ; token_STRING ; token_END_OF_INPUT. :- type ch_t % The type of a fetched character. ---> {int, % A character or `eof', stored as an int. int, % The line number. int}. % The column number. :- type inp_t % The `inputter' type. Fetches one character. ---> inp_t(inpf :: text_input_stream, line_no :: int, column_no :: int, pushback :: stack(ch_t)). :- type toktup_t % The type of a scanned token with its argument. ---> {token_t, % The token kind. string, % An argument. (May or may not be meaningful.) int, % The starting line number. int}. % The starting column number. main(!IO) :- command_line_arguments(Args, !IO), ( if (Args = []) then (InpF_filename = "-", OutF_filename = "-", main_program(InpF_filename, OutF_filename, !IO)) else if (Args = [F1]) then (InpF_filename = F1, OutF_filename = "-", main_program(InpF_filename, OutF_filename, !IO)) else if (Args = [F1, F2]) then (InpF_filename = F1, OutF_filename = F2, main_program(InpF_filename, OutF_filename, !IO)) else usage_error(!IO) ). :- pred main_program(string::in, string::in, io::di, io::uo) is det. main_program(InpF_filename, OutF_filename, !IO) :- open_InpF(InpF, InpF_filename, !IO), open_OutF(OutF, OutF_filename, !IO), init(InpF, Inp0), scan_text(OutF, Inp0, _, !IO). :- pred open_InpF(text_input_stream::out, string::in, io::di, io::uo) is det. open_InpF(InpF, InpF_filename, !IO) :- if (InpF_filename = "-") then (InpF = io.stdin_stream) else ( open_input(InpF_filename, InpF_result, !IO), ( if (InpF_result = ok(F)) then (InpF = F) else throw("Error: cannot open " ++ InpF_filename ++ " for input") ) ). :- pred open_OutF(text_output_stream::out, string::in, io::di, io::uo) is det. open_OutF(OutF, OutF_filename, !IO) :- if (OutF_filename = "-") then (OutF = io.stdout_stream) else ( open_output(OutF_filename, OutF_result, !IO), ( if (OutF_result = ok(F)) then (OutF = F) else throw("Error: cannot open " ++ OutF_filename ++ " for output") ) ). :- pred usage_error(io::di, io::uo) is det. usage_error(!IO) :- progname("lex", ProgName, !IO), (io.format("Usage: %s [INPUT_FILE [OUTPUT_FILE]]\n", [s(ProgName)], !IO)), (io.write_string("If INPUT_FILE is \"-\" or not present then standard input is used.\n", !IO)), (io.write_string("If OUTPUT_FILE is \"-\" or not present then standard output is used.\n", !IO)), set_exit_status(1, !IO). :- pred scan_text(text_output_stream::in, inp_t::in, inp_t::out, io::di, io::uo) is det. scan_text(OutF, !Inp, !IO) :- get_next_token(TokTup, !Inp, !IO), print_token(TokTup, OutF, !IO), {Tok, _, _, _} = TokTup, ( if (Tok = token_END_OF_INPUT) then true else scan_text(OutF, !Inp, !IO) ). :- pred print_token(toktup_t::in, text_output_stream::in, io::di, io::uo) is det. print_token(TokTup, OutF, !IO) :- {Tok, Arg, Line_no, Column_no} = TokTup, token_name(Tok) = TokName, (io.format(OutF, "%5d %5d %s", [i(Line_no), i(Column_no), s(TokName)], !IO)), ( if (Tok = token_IDENTIFIER) then (io.format(OutF, " %s", [s(Arg)], !IO)) else if (Tok = token_INTEGER) then (io.format(OutF, " %s", [s(Arg)], !IO)) else if (Tok = token_STRING) then (io.format(OutF, " %s", [s(Arg)], !IO)) else true ), (io.format(OutF, "\n", [], !IO)). :- func token_name(token_t) = string is det. :- pred token_name(token_t::in, string::out) is det. token_name(Tok) = Str :- token_name(Tok, Str). token_name(token_ELSE, "Keyword_else"). token_name(token_IF, "Keyword_if"). token_name(token_PRINT, "Keyword_print"). token_name(token_PUTC, "Keyword_putc"). token_name(token_WHILE, "Keyword_while"). token_name(token_MULTIPLY, "Op_multiply"). token_name(token_DIVIDE, "Op_divide"). token_name(token_MOD, "Op_mod"). token_name(token_ADD, "Op_add"). token_name(token_SUBTRACT, "Op_subtract"). token_name(token_NEGATE, "Op_negate"). token_name(token_LESS, "Op_less"). token_name(token_LESSEQUAL, "Op_lessequal"). token_name(token_GREATER, "Op_greater"). token_name(token_GREATEREQUAL, "Op_greaterequal"). token_name(token_EQUAL, "Op_equal"). token_name(token_NOTEQUAL, "Op_notequal"). token_name(token_NOT, "Op_not"). token_name(token_ASSIGN, "Op_assign"). token_name(token_AND, "Op_and"). token_name(token_OR, "Op_or"). token_name(token_LEFTPAREN, "LeftParen"). token_name(token_RIGHTPAREN, "RightParen"). token_name(token_LEFTBRACE, "LeftBrace"). token_name(token_RIGHTBRACE, "RightBrace"). token_name(token_SEMICOLON, "Semicolon"). token_name(token_COMMA, "Comma"). token_name(token_IDENTIFIER, "Identifier"). token_name(token_INTEGER, "Integer"). token_name(token_STRING, "String"). token_name(token_END_OF_INPUT, "End_of_input"). :- pred get_next_token(toktup_t::out, inp_t::in, inp_t::out, io::di, io::uo) is det. get_next_token(TokTup, !Inp, !IO) :- skip_spaces_and_comments(!Inp, !IO), get_ch(Ch, !Inp, !IO), {IChar, Line_no, Column_no} = Ch, LN = Line_no, CN = Column_no, ( if (IChar = eof) then ( TokTup = {token_END_OF_INPUT, "", LN, CN} ) else ( Char = det_from_int(IChar), ( if (Char = (',')) then (TokTup = {token_COMMA, ",", LN, CN}) else if (Char = (';')) then (TokTup = {token_SEMICOLON, ";", LN, CN}) else if (Char = ('(')) then (TokTup = {token_LEFTPAREN, "(", LN, CN}) else if (Char = (')')) then (TokTup = {token_RIGHTPAREN, ")", LN, CN}) else if (Char = ('{')) then (TokTup = {token_LEFTBRACE, "{", LN, CN}) else if (Char = ('}')) then (TokTup = {token_RIGHTBRACE, "}", LN, CN}) else if (Char = ('')) then (TokTup = {token_MULTIPLY, "", LN, CN}) else if (Char = ('/')) then (TokTup = {token_DIVIDE, "/", LN, CN}) else if (Char = ('%')) then (TokTup = {token_MOD, "%", LN, CN}) else if (Char = ('+')) then (TokTup = {token_ADD, "+", LN, CN}) else if (Char = ('-')) then (TokTup = {token_SUBTRACT, "-", LN, CN}) else if (Char = ('<')) then ( get_ch(Ch1, !Inp, !IO), Ch1 = {IChar1, _, _}, ( if (IChar1 = to_int('=')) then ( TokTup = {token_LESSEQUAL, "<=", LN, CN} ) else ( push_back(Ch1, !Inp), TokTup = {token_LESS, "<", LN, CN} ) ) ) else if (Char = ('>')) then ( get_ch(Ch1, !Inp, !IO), Ch1 = {IChar1, _, _}, ( if (IChar1 = to_int('=')) then ( TokTup = {token_GREATEREQUAL, ">=", LN, CN} ) else ( push_back(Ch1, !Inp), TokTup = {token_GREATER, ">", LN, CN} ) ) ) else if (Char = ('=')) then ( get_ch(Ch1, !Inp, !IO), Ch1 = {IChar1, _, _}, ( if (IChar1 = to_int('=')) then ( TokTup = {token_EQUAL, "==", LN, CN} ) else ( push_back(Ch1, !Inp), TokTup = {token_ASSIGN, "=", LN, CN} ) ) ) else if (Char = ('!')) then ( get_ch(Ch1, !Inp, !IO), Ch1 = {IChar1, _, _}, ( if (IChar1 = to_int('=')) then ( TokTup = {token_NOTEQUAL, "!=", LN, CN} ) else ( push_back(Ch1, !Inp), TokTup = {token_NOT, "!", LN, CN} ) ) ) else if (Char = ('&')) then ( get_ch(Ch1, !Inp, !IO), Ch1 = {IChar1, _, _}, ( if (IChar1 = to_int('&')) then ( TokTup = {token_AND, "&&", LN, CN} ) else throw("Error: unexpected character '" ++ from_char(Char) ++ "' at " ++ from_int(LN) ++ ":" ++ from_int(CN)) ) ) else if (Char = ('\|')) then ( get_ch(Ch1, !Inp, !IO), Ch1 = {IChar1, _, _}, ( if (IChar1 = to_int('\|')) then ( TokTup = {token_OR, "\|\|", LN, CN} ) else throw("Error: unexpected character '" ++ from_char(Char) ++ "' at " ++ from_int(LN) ++ ":" ++ from_int(CN)) ) ) else if (Char = ('"')) then ( push_back(Ch, !Inp), scan_string_literal(TokTup, !Inp, !IO) ) else if (Char = ('\'')) then ( push_back(Ch, !Inp), scan_character_literal(TokTup, !Inp, !IO) ) else if (is_alpha(Char)) then ( push_back(Ch, !Inp), scan_identifier_or_reserved_word( TokTup, !Inp, !IO) ) else if (is_digit(Char)) then ( push_back(Ch, !Inp), scan_integer_literal(TokTup, !Inp, !IO) ) else ( throw("Error: unexpected character '" ++ from_char(Char) ++ "' at " ++ from_int(LN) ++ ":" ++ from_int(CN)) ) ) ) ). :- pred skip_spaces_and_comments(inp_t::in, inp_t::out, io::di, io::uo) is det. skip_spaces_and_comments(!Inp, !IO) :- get_ch(Ch, !Inp, !IO), Ch = {IChar, _, _}, ( if (IChar = eof) then push_back(Ch, !Inp) else if (is_whitespace(det_from_int(IChar))) then skip_spaces_and_comments(!Inp, !IO) else if (IChar = to_int('/')) then ( get_ch(Ch1, !Inp, !IO), Ch1 = {IChar1, Line_no, Column_no}, ( if (IChar1 = to_int('')) then ( scan_comment(Line_no, Column_no, !Inp, !IO), skip_spaces_and_comments(!Inp, !IO) ) else ( push_back(Ch1, !Inp), push_back(Ch, !Inp) ) ) ) else push_back(Ch, !Inp) ). :- pred scan_comment(int::in, int::in, % line and column nos. inp_t::in, inp_t::out, io::di, io::uo) is det. scan_comment(Line_no, Column_no, !Inp, !IO) :- get_ch(Ch, !Inp, !IO), {IChar, _, _} = Ch, ( if (IChar = eof) then throw("Error: unterminated comment " ++ "starting at " ++ from_int(Line_no) ++ ":" ++ from_int(Column_no)) else ( det_from_int(IChar) = Char, ( if (Char = ('')) then ( get_ch(Ch1, !Inp, !IO), {IChar1, _, _} = Ch1, ( if (IChar1 = to_int('/')) then true % End of comment has been reached. else ( push_back(Ch1, !Inp), scan_comment(Line_no, Column_no, !Inp, !IO) ) ) ) else scan_comment(Line_no, Column_no, !Inp, !IO) ) ) ). :- pred scan_character_literal(toktup_t::out, inp_t::in, inp_t::out, io::di, io::uo) is det. scan_character_literal(TokTup, !Inp, !IO) :- get_ch(Ch, !Inp, !IO), Ch = {OpenQuote, Line_no, Column_no}, CloseQuote = OpenQuote, scan_char_lit_contents(CodePoint, Line_no, Column_no, !Inp, !IO), check_char_lit_end(CloseQuote, Line_no, Column_no, !Inp, !IO), Arg = from_int(CodePoint), TokTup = {token_INTEGER, Arg, Line_no, Column_no}. :- pred scan_char_lit_contents(int::out, int::in, int::in, inp_t::in, inp_t::out, io::di, io::uo) is det. scan_char_lit_contents(CodePoint, Line_no, Column_no, !Inp, !IO) :- get_ch(Ch1, !Inp, !IO), Ch1 = {IChar1, Line_no1, Column_no1}, ( if (IChar1 = eof) then throw("Error: end of input in character literal " ++ "starting at " ++ from_int(Line_no) ++ ":" ++ from_int(Column_no)) else if (IChar1 = to_int('\\')) then ( get_ch(Ch2, !Inp, !IO), Ch2 = {IChar2, _, _}, (if (IChar2 = eof) then throw("Error: end of input in character literal " ++ "starting at " ++ from_int(Line_no) ++ ":" ++ from_int(Column_no)) else if (IChar2 = to_int('n')) then (CodePoint = to_int('\n')) else if (IChar2 = to_int('\\')) then (CodePoint = to_int('\\')) else throw("Error: unsupported escape \\" ++ from_char(det_from_int(IChar2)) ++ " at " ++ from_int(Line_no1) ++ ":" ++ from_int(Column_no1)) ) ) else (CodePoint = IChar1) ). :- pred check_char_lit_end(int::in, int::in, int::in, inp_t::in, inp_t::out, io::di, io::uo) is det. check_char_lit_end(CloseQuote, Line_no, Column_no, !Inp, !IO) :- get_ch(Ch1, !Inp, !IO), Ch1 = {IChar1, _, _}, ( if (IChar1 = CloseQuote) then true else find_bad_char_lit_end(CloseQuote, Line_no, Column_no, !Inp, !IO) ). :- pred find_bad_char_lit_end(int::in, int::in, int::in, inp_t::in, inp_t::out, io::di, io::uo) is det. find_bad_char_lit_end(CloseQuote, Line_no, Column_no, !Inp, !IO) :- get_ch(Ch2, !Inp, !IO), Ch2 = {IChar2, _, _}, ( if (IChar2 = CloseQuote) then throw("Error: unsupported multicharacter literal " ++ " at " ++ from_int(Line_no) ++ ":" ++ from_int(Column_no)) else if (IChar2 = eof) then throw("Error: end of input in character literal " ++ " at " ++ from_int(Line_no) ++ ":" ++ from_int(Column_no)) else find_bad_char_lit_end(CloseQuote, Line_no, Column_no, !Inp, !IO) ). :- pred scan_string_literal(toktup_t::out, inp_t::in, inp_t::out, io::di, io::uo) is det. scan_string_literal(TokTup, !Inp, !IO) :- get_ch(Ch, !Inp, !IO), Ch = {OpenQuote, Line_no, Column_no}, CloseQuote = OpenQuote, scan_string_lit_contents("", Str, CloseQuote, Line_no, Column_no, !Inp, !IO), Arg = from_char(det_from_int(OpenQuote)) ++ Str ++ from_char(det_from_int(CloseQuote)), TokTup = {token_STRING, Arg, Line_no, Column_no}. :- pred scan_string_lit_contents(string::in, string::out, int::in, int::in, int::in, inp_t::in, inp_t::out, io::di, io::uo) is det. scan_string_lit_contents(Str0, Str, CloseQuote, Line_no, Column_no, !Inp, !IO) :- get_ch(Ch1, !Inp, !IO), Ch1 = {IChar1, Line_no1, Column_no1}, ( if (IChar1 = CloseQuote) then (Str = Str0) else if (IChar1 = eof) then throw("Error: end of input in string literal " ++ "starting at " ++ from_int(Line_no) ++ ":" ++ from_int(Column_no)) else if (IChar1 = to_int('\n')) then throw("Error: end of line in string literal " ++ "starting at " ++ from_int(Line_no) ++ ":" ++ from_int(Column_no)) else if (IChar1 = to_int('\\')) then ( get_ch(Ch2, !Inp, !IO), Ch2 = {IChar2, _, _}, ( if (IChar2 = to_int('n')) then ( Str1 = Str0 ++ "\\n", scan_string_lit_contents(Str1, Str, CloseQuote, Line_no, Column_no, !Inp, !IO) ) else if (IChar2 = to_int('\\')) then ( Str1 = Str0 ++ "\\\\", scan_string_lit_contents(Str1, Str, CloseQuote, Line_no, Column_no, !Inp, !IO) ) else if (IChar2 = eof) then throw("Error: end of input in string literal " ++ "starting at " ++ from_int(Line_no) ++ ":" ++ from_int(Column_no)) else if (IChar2 = to_int('\n')) then throw("Error: end of line in string literal " ++ "starting at " ++ from_int(Line_no) ++ ":" ++ from_int(Column_no)) else throw("Error: unsupported escape \\" ++ from_char(det_from_int(IChar2)) ++ " at " ++ from_int(Line_no1) ++ ":" ++ from_int(Column_no1)) ) ) else ( Char1 = det_from_int(IChar1), Str1 = Str0 ++ from_char(Char1), scan_string_lit_contents(Str1, Str, CloseQuote, Line_no, Column_no, !Inp, !IO) ) ). :- pred scan_identifier_or_reserved_word(toktup_t::out, inp_t::in, inp_t::out, io::di, io::uo) is det. scan_identifier_or_reserved_word(TokTup, !Inp, !IO) :- scan_integer_or_word(Str, Line_no, Column_no, !Inp, !IO), ( if (Str = "if") then (TokTup = {token_IF, Str, Line_no, Column_no}) else if (Str = "else") then (TokTup = {token_ELSE, Str, Line_no, Column_no}) else if (Str = "while") then (TokTup = {token_WHILE, Str, Line_no, Column_no}) else if (Str = "print") then (TokTup = {token_PRINT, Str, Line_no, Column_no}) else if (Str = "putc") then (TokTup = {token_PUTC, Str, Line_no, Column_no}) else (TokTup = {token_IDENTIFIER, Str, Line_no, Column_no}) ). :- pred scan_integer_literal(toktup_t::out, inp_t::in, inp_t::out, io::di, io::uo) is det. scan_integer_literal(TokTup, !Inp, !IO) :- scan_integer_or_word(Str, Line_no, Column_no, !Inp, !IO), ( if (not is_all_digits(Str)) then throw("Error: not a valid integer literal: " ++ Str) else (TokTup = {token_INTEGER, Str, Line_no, Column_no}) ). :- pred scan_integer_or_word(string::out, int::out, int::out, inp_t::in, inp_t::out, io::di, io::uo) is det. scan_integer_or_word(Str, Line_no, Column_no, !Inp, !IO) :- get_ch({IChar, Line_no, Column_no}, !Inp, !IO), ( if (IChar = eof) then throw("internal error") else ( Char = det_from_int(IChar), (if (not is_alnum_or_underscore(Char)) then throw("internal error") else scan_int_or_word(from_char(Char), Str, !Inp, !IO)) ) ). :- pred scan_int_or_word(string::in, string::out, inp_t::in, inp_t::out, io::di, io::uo) is det. scan_int_or_word(Str0, Str, !Inp, !IO) :- get_ch(CharTup, !Inp, !IO), {IChar, _, _} = CharTup, ( if (IChar = eof) then ( push_back(CharTup, !Inp), Str = Str0 ) else ( Char = det_from_int(IChar), ( if (not is_alnum_or_underscore(Char)) then ( push_back(CharTup, !Inp), Str = Str0 ) else scan_int_or_word(Str0 ++ from_char(Char), Str, !Inp, !IO) ) ) ). :- pred init(text_input_stream::in, inp_t::out) is det. init(Inpf, Inp) :- Inp = inp_t(Inpf, 1, 1, init). :- pred get_ch(ch_t::out, inp_t::in, inp_t::out, io::di, io::uo) is det. get_ch(Ch, Inp0, Inp, !IO) :- if (pop(Ch1, Inp0^pushback, Pushback)) then ( Ch = Ch1, Inp = (Inp0^pushback := Pushback) ) else ( inp_t(Inpf, Line_no, Column_no, Pushback) = Inp0, read_char_unboxed(Inpf, Result, Char, !IO), ( if (Result = ok) then ( Ch = {to_int(Char), Line_no, Column_no}, Inp = (if (Char = ('\n')) then inp_t(Inpf, Line_no + 1, 1, Pushback) else inp_t(Inpf, Line_no, Column_no + 1, Pushback)) ) else ( Ch = {eof, Line_no, Column_no}, Inp = Inp0 ) ) ). :- pred push_back(ch_t::in, inp_t::in, inp_t::out) is det. push_back(Ch, Inp0, Inp) :- Inp = (Inp0^pushback := push(Inp0^pushback, Ch)). :- func eof = int is det. eof = -1.</syntaxhighlight> {{out}} <pre>$ mmc -O6 --intermod-opt -E --make --warn-non-tail-recursion self-and-mutual lex && ./lex compiler-tests/testcase3.t Making Mercury/int3s/lex.int3 Making Mercury/ints/lex.int Making Mercury/opts/lex.opt Making Mercury/cs/lex.c Making Mercury/os/lex.o Making lex 5 16 Keyword_print 5 40 Op_subtract 6 16 Keyword_putc 6 40 Op_less 7 16 Keyword_if 7 40 Op_greater 8 16 Keyword_else 8 40 Op_lessequal 9 16 Keyword_while 9 40 Op_greaterequal 10 16 LeftBrace 10 40 Op_equal 11 16 RightBrace 11 40 Op_notequal 12 16 LeftParen 12 40 Op_and 13 16 RightParen 13 40 Op_or 14 16 Op_subtract 14 40 Semicolon 15 16 Op_not 15 40 Comma 16 16 Op_multiply 16 40 Op_assign 17 16 Op_divide 17 40 Integer 42 18 16 Op_mod 18 40 String "String literal" 19 16 Op_add 19 40 Identifier variable_name 20 26 Integer 10 21 26 Integer 92 22 26 Integer 32 23 1 End_of_input</pre> =={{header\|Nim}}== Tested with Nim v0.19.4. Both examples are tested against all programs in [[Compiler/Sample programs]]. ===Using string with regular expressions=== <~~lang~~syntaxhighlight lang="nim"> import re, strformat, strutils Line 9,095 ⟶ 12,263: echo input.tokenize.output </syntaxhighlight> ~~</lang>~~ ===Using stream with lexer library=== <~~lang~~syntaxhighlight lang="nim"> import lexbase, streams from strutils import Whitespace Line 9,408 ⟶ 12,576: echo &"({l.lineNumber},{l.getColNumber l.bufpos + 1}) {l.error}" main() </syntaxhighlight> ~~</lang>~~ ===Using nothing but system and strutils=== <~~lang~~syntaxhighlight lang="nim">import strutils type Line 9,631 ⟶ 12,799: stdout.write('\n') if token.kind == tokEnd: break</~~lang~~syntaxhighlight> =={{header\|ObjectIcon}}== {{trans\|Icon}} {{trans\|ATS}} There are very few changes from the ordinary Icon version: I/O is modified to use FileStreams; and the '''max''' procedure is removed, because there is an Object Icon builtin procedure. <syntaxhighlight lang="objecticon"># -- ObjectIcon -- # # The Rosetta Code lexical analyzer in Object Icon. Based upon the ATS # implementation. # # Usage: lex [INPUTFILE [OUTPUTFILE]] # If INPUTFILE or OUTPUTFILE is "-" or missing, then standard input # or standard output is used, respectively. ) # import io $define EOF -1 $define TOKEN_ELSE 0 $define TOKEN_IF 1 $define TOKEN_PRINT 2 $define TOKEN_PUTC 3 $define TOKEN_WHILE 4 $define TOKEN_MULTIPLY 5 $define TOKEN_DIVIDE 6 $define TOKEN_MOD 7 $define TOKEN_ADD 8 $define TOKEN_SUBTRACT 9 $define TOKEN_NEGATE 10 $define TOKEN_LESS 11 $define TOKEN_LESSEQUAL 12 $define TOKEN_GREATER 13 $define TOKEN_GREATEREQUAL 14 $define TOKEN_EQUAL 15 $define TOKEN_NOTEQUAL 16 $define TOKEN_NOT 17 $define TOKEN_ASSIGN 18 $define TOKEN_AND 19 $define TOKEN_OR 20 $define TOKEN_LEFTPAREN 21 $define TOKEN_RIGHTPAREN 22 $define TOKEN_LEFTBRACE 23 $define TOKEN_RIGHTBRACE 24 $define TOKEN_SEMICOLON 25 $define TOKEN_COMMA 26 $define TOKEN_IDENTIFIER 27 $define TOKEN_INTEGER 28 $define TOKEN_STRING 29 $define TOKEN_END_OF_INPUT 30 global whitespace global ident_start global ident_continuation procedure main(args) local inpf, outf local pushback_buffer, inp, pushback initial { whitespace := ' \t\v\f\r\n' ident_start := '_' ++ &letters ident_continuation := ident_start ++ &digits } inpf := FileStream.stdin outf := FileStream.stdout if 1 <= args & args[1] ~== "-" then { inpf := FileStream(args[1], FileOpt.RDONLY) \| stop(&why) } if 2 <= args & args[2] ~== "-" then { outf := FileStream(args[2], ior(FileOpt.WRONLY, FileOpt.TRUNC, FileOpt.CREAT)) \| stop(&why) } pushback_buffer := [] inp := create inputter(inpf, pushback_buffer) pushback := create repeat push(pushback_buffer, \@&source) @pushback # The first invocation does nothing. scan_text(outf, inp, pushback) end procedure scan_text(outf, inp, pushback) local ch while /ch \| ch[1] ~=== EOF do { skip_spaces_and_comments(inp, pushback) ch := @inp if ch[1] === EOF then { print_token(outf, [TOKEN_END_OF_INPUT, "", ch[2], ch[3]]) } else { ch @pushback print_token(outf, get_next_token(inp, pushback)) } } end procedure get_next_token(inp, pushback) local ch, ch1 local ln, cn skip_spaces_and_comments(inp, pushback) ch := @inp ln := ch[2] # line number cn := ch[3] # column number case ch[1] of { "," : return [TOKEN_COMMA, ",", ln, cn] ";" : return [TOKEN_SEMICOLON, ";", ln, cn] "(" : return [TOKEN_LEFTPAREN, "(", ln, cn] ")" : return [TOKEN_RIGHTPAREN, ")", ln, cn] "{" : return [TOKEN_LEFTBRACE, "{", ln, cn] "}" : return [TOKEN_RIGHTBRACE, "}", ln, cn] "" : return [TOKEN_MULTIPLY, "", ln, cn] "/" : return [TOKEN_DIVIDE, "/", ln, cn] "%" : return [TOKEN_MOD, "%", ln, cn] "+" : return [TOKEN_ADD, "+", ln, cn] "-" : return [TOKEN_SUBTRACT, "-", ln, cn] "<" : { ch1 := @inp if ch1[1] === "=" then { return [TOKEN_LESSEQUAL, "<=", ln, cn] } else { ch1 @pushback return [TOKEN_LESS, "<", ln, cn] } } ">" : { ch1 := @inp if ch1[1] === "=" then { return [TOKEN_GREATEREQUAL, ">=", ln, cn] } else { ch1 @pushback return [TOKEN_GREATER, ">", ln, cn] } } "=" : { ch1 := @inp if ch1[1] === "=" then { return [TOKEN_EQUAL, "==", ln, cn] } else { ch1 @pushback return [TOKEN_ASSIGN, "=", ln, cn] } } "!" : { ch1 := @inp if ch1[1] === "=" then { return [TOKEN_NOTEQUAL, "!=", ln, cn] } else { ch1 @pushback return [TOKEN_NOT, "!", ln, cn] } } "&" : { ch1 := @inp if ch1[1] === "&" then { return [TOKEN_AND, "&&", ln, cn] } else { unexpected_character(ln, cn, ch) } } "\|" : { ch1 := @inp if ch1[1] === "\|" then { return [TOKEN_OR, "\|\|", ln, cn] } else { unexpected_character(ln, cn, ch) } } "\"" : { ch @pushback return scan_string_literal(inp) } "'" : { ch @pushback return scan_character_literal(inp, pushback) } default : { if any(&digits, ch[1]) then { ch @pushback return scan_integer_literal(inp, pushback) } else if any(ident_start, ch[1]) then { ch @pushback return scan_identifier_or_reserved_word (inp, pushback) } else { unexpected_character(ln, cn, ch) } } } end procedure scan_identifier_or_reserved_word(inp, pushback) local ch local s local line_no, column_no s := "" ch := @inp line_no := ch[2] column_no := ch[3] while EOF ~=== ch[1] & any(ident_continuation, ch[1]) do { s \|\|:= ch[1] ch := @inp } ch @pushback return reserved_word_lookup (s, line_no, column_no) end procedure scan_integer_literal(inp, pushback) local ch local s local line_no, column_no s := "" ch := @inp line_no := ch[2] column_no := ch[3] while EOF ~=== ch[1] & any(ident_continuation, ch[1]) do { s \|\|:= ch[1] ch := @inp } ch @pushback not upto(~&digits, s) \| invalid_integer_literal(line_no, column_no, s) return [TOKEN_INTEGER, s, line_no, column_no] end procedure scan_character_literal(inp, pushback) local ch, ch1 local close_quote local toktup local line_no, column_no ch := @inp # The opening quote. close_quote := ch[1] # Same as the opening quote. ch @pushback line_no := ch[2] column_no := ch[3] toktup := scan_character_literal_without_checking_end(inp) ch1 := @inp if ch1[1] ~=== close_quote then { repeat { case ch1[1] of { EOF : unterminated_character_literal(line_no, column_no) close_quote : multicharacter_literal(line_no, column_no) default : ch1 := @inp } } } return toktup end procedure scan_character_literal_without_checking_end(inp) local ch, ch1, ch2 ch := @inp # The opening quote. ch1 := @inp EOF ~=== ch1[1] \| unterminated_character_literal(ch[2], ch[3]) if ch1[1] == "\\" then { ch2 := @inp EOF ~=== ch2[1] \| unterminated_character_literal(ch[2], ch[3]) case ch2[1] of { "n" : return [TOKEN_INTEGER, string(ord("\n")), ch[2], ch[3]] "\\" : return [TOKEN_INTEGER, string(ord("\\")), ch[2], ch[3]] default : unsupported_escape(ch1[2], ch1[3], ch2) } } else { return [TOKEN_INTEGER, string(ord(ch1[1])), ch[2], ch[3]] } end procedure scan_string_literal(inp) local ch, ch1, ch2 local line_no, column_no local close_quote local s local retval ch := @inp # The opening quote close_quote := ch[1] # Same as the opening quote. line_no := ch[2] column_no := ch[3] s := ch[1] until \retval do { ch1 := @inp ch1[1] ~=== EOF \| unterminated_string_literal (line_no, column_no, "end of input") ch1[1] ~== "\n" \| unterminated_string_literal (line_no, column_no, "end of line") if ch1[1] == close_quote then { retval := [TOKEN_STRING, s \|\| close_quote, line_no, column_no] } else if ch1[1] ~== "\\" then { s \|\|:= ch1[1] } else { ch2 := @inp EOF ~=== ch2[1] \| unsupported_escape(line_no, column_no, ch2) case ch2[1] of { "n" : s \|\|:= "\\n" "\\" : s \|\|:= "\\\\" default : unsupported_escape(line_no, column_no, ch2) } } } return retval end procedure skip_spaces_and_comments(inp, pushback) local ch, ch1 repeat { ch := @inp (EOF === ch[1]) & { ch @pushback; return } if not any(whitespace, ch[1]) then { (ch[1] == "/") \| { ch @pushback; return } (ch1 := @inp) \| { ch @pushback; return } (ch1[1] == "") \| { ch1 @pushback; ch @pushback; return } scan_comment(inp, ch[2], ch[3]) } } end procedure scan_comment(inp, line_no, column_no) local ch, ch1 until (\ch)[1] == "" & (\ch1)[1] == "/" do { ch := @inp (EOF === ch[1]) & unterminated_comment(line_no, column_no) if ch[1] == "" then { ch1 := @inp (EOF === ch[1]) & unterminated_comment(line_no, column_no) } } return end procedure reserved_word_lookup(s, line_no, column_no) # Lookup is by an extremely simple perfect hash. static reserved_words static reserved_word_tokens local hashval, token, toktup initial { reserved_words := ["if", "print", "else", "", "putc", "", "", "while", ""] reserved_word_tokens := [TOKEN_IF, TOKEN_PRINT, TOKEN_ELSE, TOKEN_IDENTIFIER, TOKEN_PUTC, TOKEN_IDENTIFIER, TOKEN_IDENTIFIER, TOKEN_WHILE, TOKEN_IDENTIFIER] } if s < 2 then { toktup := [TOKEN_IDENTIFIER, s, line_no, column_no] } else { hashval := ((ord(s[1]) + ord(s[2])) % (reserved_words)) + 1 token := reserved_word_tokens[hashval] if token = TOKEN_IDENTIFIER \| s ~== reserved_words[hashval] then { toktup := [TOKEN_IDENTIFIER, s, line_no, column_no] } else { toktup := [token, s, line_no, column_no] } } return toktup end procedure print_token(outf, toktup) static token_names local s_line, s_column initial { token_names := ["Keyword_else", "Keyword_if", "Keyword_print", "Keyword_putc", "Keyword_while", "Op_multiply", "Op_divide", "Op_mod", "Op_add", "Op_subtract", "Op_negate", "Op_less", "Op_lessequal", "Op_greater", "Op_greaterequal", "Op_equal", "Op_notequal", "Op_not", "Op_assign", "Op_and", "Op_or", "LeftParen", "RightParen", "LeftBrace", "RightBrace", "Semicolon", "Comma", "Identifier", "Integer", "String", "End_of_input"] } /outf := FileStream.stdout s_line := string(toktup[3]) s_column := string(toktup[4]) writes(outf, right (s_line, max(5, s_line))) writes(outf, " ") writes(outf, right (s_column, max(5, s_column))) writes(outf, " ") writes(outf, token_names[toktup[1] + 1]) case toktup[1] of { TOKEN_IDENTIFIER : writes(outf, " ", toktup[2]) TOKEN_INTEGER : writes(outf, " ", toktup[2]) TOKEN_STRING : writes(outf, " ", toktup[2]) } write(outf) return end procedure inputter(inpf, pushback_buffer) local buffer local line_no, column_no local c buffer := "" line_no := 1 column_no := 1 repeat { buffer? { until pushback_buffer = 0 & pos(0) do { if pushback_buffer ~= 0 then { suspend pop(pushback_buffer) } else { c := move(1) suspend [c, line_no, column_no] if c == "\n" then { line_no +:= 1 column_no := 1 } else { column_no +:= 1 } } } } (buffer := reads(inpf, 2048)) \| suspend [EOF, line_no, column_no] } end procedure unterminated_comment(line_no, column_no) error("unterminated comment starting at ", line_no, ":", column_no) end procedure unexpected_character(line_no, column_no, ch) error("unexpected character '", ch[1], "' starting at ", line_no, ":", column_no) end procedure unterminated_string_literal (line_no, column_no, cause) error("unterminated string literal (", cause, ") starting at ", line_no, ":", column_no) end procedure unsupported_escape (line_no, column_no, ch) if ch[1] === EOF then { error("unexpected \\ at end of input", " starting at ", line_no, ":", column_no) } else { error("unsupported escape \\", ch[1], " starting at ", line_no, ":", column_no) } end procedure invalid_integer_literal(line_no, column_no, s) error("invalid integer literal ", s, " starting at ", line_no, ":", column_no) end procedure unterminated_character_literal(line_no, column_no) error("unterminated character literal starting at ", line_no, ":", column_no) end procedure multicharacter_literal(line_no, column_no) error("unsupported multicharacter literal starting at ", line_no, ":", column_no) end procedure error(args[]) write!([FileStream.stderr] \|\|\| args) exit(1) end</syntaxhighlight> {{out}} <pre>$ oit -s -o lex lex-in-ObjectIcon.icn && ./lex compiler-tests/testcase3.t 5 16 Keyword_print 5 40 Op_subtract 6 16 Keyword_putc 6 40 Op_less 7 16 Keyword_if 7 40 Op_greater 8 16 Keyword_else 8 40 Op_lessequal 9 16 Keyword_while 9 40 Op_greaterequal 10 16 LeftBrace 10 40 Op_equal 11 16 RightBrace 11 40 Op_notequal 12 16 LeftParen 12 40 Op_and 13 16 RightParen 13 40 Op_or 14 16 Op_subtract 14 40 Semicolon 15 16 Op_not 15 40 Comma 16 16 Op_multiply 16 40 Op_assign 17 16 Op_divide 17 40 Integer 42 18 16 Op_mod 18 40 String "String literal" 19 16 Op_add 19 40 Identifier variable_name 20 26 Integer 10 21 26 Integer 92 22 26 Integer 32 23 1 End_of_input</pre> =={{header\|OCaml}}== {{works with\|OCaml\|4.12.1}} {{trans\|ATS}} This is a close translation of the ATS. It may interest the reader to compare the two implementations. (Much of the extra complication in the ATS comes from arrays being a linear type (whose "views" need tending), and from values of linear type having to be local to any function using them. This limitation could have been worked around, and arrays more similar to OCaml arrays could have been used, but at a cost in safety and efficiency.) <syntaxhighlight lang="ocaml">(------------------------------------------------------------------) (* The Rosetta Code lexical analyzer, in OCaml. Based on the ATS. ) ( When you compare this code to the ATS code, please keep in mind that, although ATS has an ML-like syntax: * The type system is not the same at all. * Most ATS functions are not closures. Those that are will have special notations such as "<cloref1>" associated with them. ) (------------------------------------------------------------------) ( The following functions are compatible with ASCII. ) let is_digit ichar = 48 <= ichar && ichar <= 57 let is_lower ichar = 97 <= ichar && ichar <= 122 let is_upper ichar = 65 <= ichar && ichar <= 90 let is_alpha ichar = is_lower ichar \|\| is_upper ichar let is_alnum ichar = is_digit ichar \|\| is_alpha ichar let is_ident_start ichar = is_alpha ichar \|\| ichar = 95 let is_ident_continuation ichar = is_alnum ichar \|\| ichar = 95 let is_space ichar = ichar = 32 \|\| (9 <= ichar && ichar <= 13) (------------------------------------------------------------------) let reverse_list_to_string lst = List.rev lst \|> List.to_seq \|> String.of_seq (------------------------------------------------------------------) ( Character input more like that of C. There are various advantages and disadvantages to this method, but key points in its favor are: (a) it is how character input is done in the original ATS code, (b) Unicode code points are 21-bit positive integers. ) let eof = (-1) let input_ichar channel = try int_of_char (input_char channel) with \| End_of_file -> eof (------------------------------------------------------------------) ( The type of an input character. ) module Ch = struct type t = { ichar : int; line_no : int; column_no : int } end (------------------------------------------------------------------) ( Inputting with unlimited pushback, and with counting of lines and columns. ) module Inp = struct type t = { inpf : in_channel; pushback : Ch.t list; line_no : int; column_no : int } let of_in_channel inpf = { inpf = inpf; pushback = []; line_no = 1; column_no = 1 } let get_ch inp = match inp.pushback with \| ch :: tail -> (ch, {inp with pushback = tail}) \| [] -> let ichar = input_ichar inp.inpf in if ichar = int_of_char '\n' then ({ ichar = ichar; line_no = inp.line_no; column_no = inp.column_no }, { inp with line_no = inp.line_no + 1; column_no = 1 }) else ({ ichar = ichar; line_no = inp.line_no; column_no = inp.column_no }, { inp with column_no = inp.column_no + 1 }) let push_back_ch ch inp = {inp with pushback = ch :: inp.pushback} end (------------------------------------------------------------------) ( Tokens, appearing in tuples with arguments, and with line and column numbers. The tokens are integers, so they can be used as array indices. ) ( (token, argument, line_no, column_no) ) type toktup_t = int string * int * int let token_ELSE = 0 let token_IF = 1 let token_PRINT = 2 let token_PUTC = 3 let token_WHILE = 4 let token_MULTIPLY = 5 let token_DIVIDE = 6 let token_MOD = 7 let token_ADD = 8 let token_SUBTRACT = 9 let token_NEGATE = 10 let token_LESS = 11 let token_LESSEQUAL = 12 let token_GREATER = 13 let token_GREATEREQUAL = 14 let token_EQUAL = 15 let token_NOTEQUAL = 16 let token_NOT = 17 let token_ASSIGN = 18 let token_AND = 19 let token_OR = 20 let token_LEFTPAREN = 21 let token_RIGHTPAREN = 22 let token_LEFTBRACE = 23 let token_RIGHTBRACE = 24 let token_SEMICOLON = 25 let token_COMMA = 26 let token_IDENTIFIER = 27 let token_INTEGER = 28 let token_STRING = 29 let token_END_OF_INPUT = 30 ;; (* A very simple perfect hash for the reserved words. (Yes, this is overkill, except for demonstration of the principle.) ) let reserved_words = [\| "if"; "print"; "else"; ""; "putc"; ""; ""; "while"; "" \|] let reserved_word_tokens = [\| token_IF; token_PRINT; token_ELSE; token_IDENTIFIER; token_PUTC; token_IDENTIFIER; token_IDENTIFIER; token_WHILE; token_IDENTIFIER \|] let reserved_word_lookup s line_no column_no = if String.length s < 2 then (token_IDENTIFIER, s, line_no, column_no) else let hashval = (int_of_char s.[0] + int_of_char s.[1]) mod 9 in let token = reserved_word_tokens.(hashval) in if token = token_IDENTIFIER \|\| s <> reserved_words.(hashval) then (token_IDENTIFIER, s, line_no, column_no) else (token, s, line_no, column_no) ( Token to string lookup. ) let token_names = [\| "Keyword_else"; "Keyword_if"; "Keyword_print"; "Keyword_putc"; "Keyword_while"; "Op_multiply"; "Op_divide"; "Op_mod"; "Op_add"; "Op_subtract"; "Op_negate"; "Op_less"; "Op_lessequal"; "Op_greater"; "Op_greaterequal"; "Op_equal"; "Op_notequal"; "Op_not"; "Op_assign"; "Op_and"; "Op_or"; "LeftParen"; "RightParen"; "LeftBrace"; "RightBrace"; "Semicolon"; "Comma"; "Identifier"; "Integer"; "String"; "End_of_input" \|] let token_name token = token_names.(token) (------------------------------------------------------------------) exception Unterminated_comment of int int exception Unterminated_character_literal of int * int exception Multicharacter_literal of int * int exception End_of_input_in_string_literal of int * int exception End_of_line_in_string_literal of int * int exception Unsupported_escape of int * int * int exception Invalid_integer_literal of int * int * string exception Unexpected_character of int * int * char (------------------------------------------------------------------) (* Skipping past spaces and comments. (A comment in the target language is, if you think about it, a kind of space.) ) let scan_comment inp line_no column_no = let rec loop inp = let (ch, inp) = Inp.get_ch inp in if ch.ichar = eof then raise (Unterminated_comment (line_no, column_no)) else if ch.ichar = int_of_char '' then let (ch1, inp) = Inp.get_ch inp in if ch1.ichar = eof then raise (Unterminated_comment (line_no, column_no)) else if ch1.ichar = int_of_char '/' then inp else loop inp else loop inp in loop inp let skip_spaces_and_comments inp = let rec loop inp = let (ch, inp) = Inp.get_ch inp in if is_space ch.ichar then loop inp else if ch.ichar = int_of_char '/' then let (ch1, inp) = Inp.get_ch inp in if ch1.ichar = int_of_char '' then scan_comment inp ch.line_no ch.column_no \|> loop else let inp = Inp.push_back_ch ch1 inp in let inp = Inp.push_back_ch ch inp in inp else Inp.push_back_ch ch inp in loop inp (------------------------------------------------------------------) ( Integer literals, identifiers, and reserved words. ) ( In ATS the predicate for simple scan was supplied by template expansion, which (typically) produces faster code than passing a function or closure (although passing either of those could have been done). Here I pass the predicate as a function/closure. It is worth contrasting the methods. ) let rec simple_scan pred lst inp = let (ch, inp) = Inp.get_ch inp in if pred ch.ichar then simple_scan pred (char_of_int ch.ichar :: lst) inp else (lst, Inp.push_back_ch ch inp) ( Demonstration of one way to make a new closure in OCaml. (In ATS, one might see things that look similar but are actually template operations.) ) let simple_scan_iic = simple_scan is_ident_continuation let scan_integer_literal inp = let (ch, inp) = Inp.get_ch inp in let _ = assert (is_digit ch.ichar) in let (lst, inp) = simple_scan_iic [char_of_int ch.ichar] inp in let s = reverse_list_to_string lst in if List.for_all (fun c -> is_digit (int_of_char c)) lst then ((token_INTEGER, s, ch.line_no, ch.column_no), inp) else raise (Invalid_integer_literal (ch.line_no, ch.column_no, s)) let scan_identifier_or_reserved_word inp = let (ch, inp) = Inp.get_ch inp in let _ = assert (is_ident_start ch.ichar) in let (lst, inp) = simple_scan_iic [char_of_int ch.ichar] inp in let s = reverse_list_to_string lst in let toktup = reserved_word_lookup s ch.line_no ch.column_no in (toktup, inp) (------------------------------------------------------------------) ( String literals. ) let scan_string_literal inp = let (ch, inp) = Inp.get_ch inp in let _ = assert (ch.ichar = int_of_char '"') in let rec scan lst inp = let (ch1, inp) = Inp.get_ch inp in if ch1.ichar = eof then raise (End_of_input_in_string_literal (ch.line_no, ch.column_no)) else if ch1.ichar = int_of_char '\n' then raise (End_of_line_in_string_literal (ch.line_no, ch.column_no)) else if ch1.ichar = int_of_char '"' then (lst, inp) else if ch1.ichar <> int_of_char '\\' then scan (char_of_int ch1.ichar :: lst) inp else let (ch2, inp) = Inp.get_ch inp in if ch2.ichar = int_of_char 'n' then scan ('n' :: '\\' :: lst) inp else if ch2.ichar = int_of_char '\\' then scan ('\\' :: '\\' :: lst) inp else raise (Unsupported_escape (ch1.line_no, ch1.column_no, ch2.ichar)) in let lst = '"' :: [] in let (lst, inp) = scan lst inp in let lst = '"' :: lst in let s = reverse_list_to_string lst in ((token_STRING, s, ch.line_no, ch.column_no), inp) (------------------------------------------------------------------) ( Character literals. ) let scan_character_literal_without_checking_end inp = let (ch, inp) = Inp.get_ch inp in let _ = assert (ch.ichar = int_of_char '\'') in let (ch1, inp) = Inp.get_ch inp in if ch1.ichar = eof then raise (Unterminated_character_literal (ch.line_no, ch.column_no)) else if ch1.ichar = int_of_char '\\' then let (ch2, inp) = Inp.get_ch inp in if ch2.ichar = eof then raise (Unterminated_character_literal (ch.line_no, ch.column_no)) else if ch2.ichar = int_of_char 'n' then let s = (int_of_char '\n' \|> string_of_int) in ((token_INTEGER, s, ch.line_no, ch.column_no), inp) else if ch2.ichar = int_of_char '\\' then let s = (int_of_char '\\' \|> string_of_int) in ((token_INTEGER, s, ch.line_no, ch.column_no), inp) else raise (Unsupported_escape (ch1.line_no, ch1.column_no, ch2.ichar)) else let s = string_of_int ch1.ichar in ((token_INTEGER, s, ch.line_no, ch.column_no), inp) let scan_character_literal inp = let (toktup, inp) = scan_character_literal_without_checking_end inp in let (_, _, line_no, column_no) = toktup in let check_end inp = let (ch, inp) = Inp.get_ch inp in if ch.ichar = int_of_char '\'' then inp else let rec loop_to_end (ch1 : Ch.t) inp = if ch1.ichar = eof then raise (Unterminated_character_literal (line_no, column_no)) else if ch1.ichar = int_of_char '\'' then raise (Multicharacter_literal (line_no, column_no)) else let (ch1, inp) = Inp.get_ch inp in loop_to_end ch1 inp in loop_to_end ch inp in let inp = check_end inp in (toktup, inp) (------------------------------------------------------------------) let get_next_token inp = let inp = skip_spaces_and_comments inp in let (ch, inp) = Inp.get_ch inp in let ln = ch.line_no in let cn = ch.column_no in if ch.ichar = eof then ((token_END_OF_INPUT, "", ln, cn), inp) else match char_of_int ch.ichar with \| ',' -> ((token_COMMA, ",", ln, cn), inp) \| ';' -> ((token_SEMICOLON, ";", ln, cn), inp) \| '(' -> ((token_LEFTPAREN, "(", ln, cn), inp) \| ')' -> ((token_RIGHTPAREN, ")", ln, cn), inp) \| '{' -> ((token_LEFTBRACE, "{", ln, cn), inp) \| '}' -> ((token_RIGHTBRACE, "}", ln, cn), inp) \| '' -> ((token_MULTIPLY, "", ln, cn), inp) \| '/' -> ((token_DIVIDE, "/", ln, cn), inp) \| '%' -> ((token_MOD, "%", ln, cn), inp) \| '+' -> ((token_ADD, "+", ln, cn), inp) \| '-' -> ((token_SUBTRACT, "-", ln, cn), inp) \| '<' -> let (ch1, inp) = Inp.get_ch inp in if ch1.ichar = int_of_char '=' then ((token_LESSEQUAL, "<=", ln, cn), inp) else let inp = Inp.push_back_ch ch1 inp in ((token_LESS, "<", ln, cn), inp) \| '>' -> let (ch1, inp) = Inp.get_ch inp in if ch1.ichar = int_of_char '=' then ((token_GREATEREQUAL, ">=", ln, cn), inp) else let inp = Inp.push_back_ch ch1 inp in ((token_GREATER, ">", ln, cn), inp) \| '=' -> let (ch1, inp) = Inp.get_ch inp in if ch1.ichar = int_of_char '=' then ((token_EQUAL, "==", ln, cn), inp) else let inp = Inp.push_back_ch ch1 inp in ((token_ASSIGN, "=", ln, cn), inp) \| '!' -> let (ch1, inp) = Inp.get_ch inp in if ch1.ichar = int_of_char '=' then ((token_NOTEQUAL, "!=", ln, cn), inp) else let inp = Inp.push_back_ch ch1 inp in ((token_NOT, "!", ln, cn), inp) \| '&' -> let (ch1, inp) = Inp.get_ch inp in if ch1.ichar = int_of_char '&' then ((token_AND, "&&", ln, cn), inp) else raise (Unexpected_character (ch.line_no, ch.column_no, char_of_int ch.ichar)) \| '\|' -> let (ch1, inp) = Inp.get_ch inp in if ch1.ichar = int_of_char '\|' then ((token_OR, "\|\|", ln, cn), inp) else raise (Unexpected_character (ch.line_no, ch.column_no, char_of_int ch.ichar)) \| '"' -> let inp = Inp.push_back_ch ch inp in scan_string_literal inp \| '\'' -> let inp = Inp.push_back_ch ch inp in scan_character_literal inp \| _ when is_digit ch.ichar -> let inp = Inp.push_back_ch ch inp in scan_integer_literal inp \| _ when is_ident_start ch.ichar -> let inp = Inp.push_back_ch ch inp in scan_identifier_or_reserved_word inp \| _ -> raise (Unexpected_character (ch.line_no, ch.column_no, char_of_int ch.ichar)) let print_token outf toktup = let (token, arg, line_no, column_no) = toktup in let name = token_name token in let (padding, str) = match 0 with \| _ when token = token_IDENTIFIER -> (" ", arg) \| _ when token = token_INTEGER -> (" ", arg) \| _ when token = token_STRING -> (" ", arg) \| _ -> ("", "") in Printf.fprintf outf "%5d %5d %s%s%s\n" line_no column_no name padding str let scan_text outf inp = let rec loop inp = let (toktup, inp) = get_next_token inp in begin print_token outf toktup; let (token, _, _, _) = toktup in if token <> token_END_OF_INPUT then loop inp end in loop inp (------------------------------------------------------------------) let main () = let inpf_filename = if 2 <= Array.length Sys.argv then Sys.argv.(1) else "-" in let outf_filename = if 3 <= Array.length Sys.argv then Sys.argv.(2) else "-" in let inpf = if inpf_filename = "-" then stdin else open_in inpf_filename in let outf = if outf_filename = "-" then stdout else open_out outf_filename in let inp = Inp.of_in_channel inpf in scan_text outf inp ;; main () (------------------------------------------------------------------)</syntaxhighlight> {{out}} <pre>$ ocamlopt -O2 lex.ml && ./a.out compiler-tests/testcase3.t 5 16 Keyword_print 5 40 Op_subtract 6 16 Keyword_putc 6 40 Op_less 7 16 Keyword_if 7 40 Op_greater 8 16 Keyword_else 8 40 Op_lessequal 9 16 Keyword_while 9 40 Op_greaterequal 10 16 LeftBrace 10 40 Op_equal 11 16 RightBrace 11 40 Op_notequal 12 16 LeftParen 12 40 Op_and 13 16 RightParen 13 40 Op_or 14 16 Op_subtract 14 40 Semicolon 15 16 Op_not 15 40 Comma 16 16 Op_multiply 16 40 Op_assign 17 16 Op_divide 17 40 Integer 42 18 16 Op_mod 18 40 String "String literal" 19 16 Op_add 19 40 Identifier variable_name 20 26 Integer 10 21 26 Integer 92 22 26 Integer 32 23 1 End_of_input</pre> =={{header\|Ol}}== Line 9,637 ⟶ 13,924: Note: we do not print the line and token source code position for the simplicity. <~~lang~~syntaxhighlight lang="scheme"> (import (owl parse)) Line 9,761 ⟶ 14,048: (if (null? (cdr stream)) (print 'End_of_input)))) </syntaxhighlight> ~~</lang>~~ ==== Testing ==== Testing function: <~~lang~~syntaxhighlight lang="scheme"> (define (translate source) (let ((stream (try-parse token-parser (str-iter source) #t))) Line 9,772 ⟶ 14,059: (if (null? (force (cdr stream))) (print 'End_of_input)))) </syntaxhighlight> ~~</lang>~~ ====== Testcase 1 ====== <~~lang~~syntaxhighlight lang="scheme"> (translate " / Line 9,782 ⟶ 14,069: / print(\"Hello, World!\\\\n\"); ")</~~lang~~syntaxhighlight> {{Out}} <pre> Line 9,795 ⟶ 14,082: ====== Testcase 2 ====== <~~lang~~syntaxhighlight lang="scheme"> (translate " / Line 9,802 ⟶ 14,089: phoenix_number = 142857; print(phoenix_number, \"\\\\n\"); ")</~~lang~~syntaxhighlight> {{Out}} <pre> Line 9,821 ⟶ 14,108: ====== Testcase 3 ====== <~~lang~~syntaxhighlight lang="scheme"> (translate " /* Line 9,845 ⟶ 14,132: /* character literal / '\\\\' / character literal / ' ' ")</~~lang~~syntaxhighlight> {{Out}} <pre> Line 9,886 ⟶ 14,173: ====== Testcase 4 ====== <~~lang~~syntaxhighlight lang="scheme"> (translate " /** test printing, embedded \\\\n and comments with lots of '' */ Line 9,893 ⟶ 14,180: print(\"Print a slash n - \\\\\\\\n.\\\\n\"); ") </syntaxhighlight> ~~</lang>~~ {{Out}} <pre> Line 9,916 ⟶ 14,203: =={{header\|Perl}}== <~~lang~~syntaxhighlight lang="perl">#!/usr/bin/env perl use strict; Line 10,055 ⟶ 14,342: ($line, $col) } }</~~lang~~syntaxhighlight> {{out\|case=test case 3}} Line 10,098 ⟶ 14,385: ===Alternate Perl Solution=== Tested on perl v5.26.1 <~~lang~~syntaxhighlight ~~Perl~~lang="perl">#!/usr/bin/perl use strict; # lex.pl - source to tokens Line 10,134 ⟶ 14,421: 1 + $` =~ tr/\n//, 1 + length $` =~ s/.\n//sr, $^R; } printf "%5d %7d %s\n", 1 + tr/\n//, 1, 'End_of_input';</~~lang~~syntaxhighlight> =={{header\|Phix}}== Line 10,141 ⟶ 14,428: form. If required, demo\rosetta\Compiler\extra.e (below) contains some code that achieves the latter. Code to print the human readable forms is likewise kept separate from any re-usable parts. <!--<~~lang~~syntaxhighlight ~~Phix~~lang="phix">(phixonline)--> <span style="color: #000080;font-style:italic;">-- -- demo\rosetta\Compiler\core.e Line 10,301 ⟶ 14,588: <span style="color: #008080;">return</span> <span style="color: #000000;">s</span> <span style="color: #008080;">end</span> <span style="color: #008080;">function</span> <!--</~~lang~~syntaxhighlight>--> For running under pwa/p2js, we also have a "fake file/io" component: <!--<~~lang~~syntaxhighlight ~~Phix~~lang="phix">(phixonline)--> <span style="color: #000080;font-style:italic;">-- -- demo\rosetta\Compiler\js_io.e Line 10,312 ⟶ 14,599: --</span> <span style="color: #008080;">with</span> <span style="color: #008080;">javascript_semantics</span> <span style="color: #~~008080~~004080;">~~constant~~sequence</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">known_files</span><span style="color: #0000FF;">,</span><span style="color: #000000;">kfc</span><span style="color: #0000FF;">}</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">columnize</span><span style="color: #0000FF;">({</span> <span style="color: #0000FF;">{</span><span style="color: #008000;">"test3.c"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">""" /* Line 10,377 ⟶ 14,664: } print(a); """</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)},</span> <span style="color: #0000FF;">{</span><span style="color: #008000;">"Header.h"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">""" #define area(h, w) h * w """</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)},</span> <span style="color: #0000FF;">{</span><span style="color: #008000;">"Source.t"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">""" #include "Header.h" #define width 5 #define height 6 area = #area(height, width)#; """</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)}})</span> <span style="color: #004080;">~~integer~~sequence</span> <span style="color: #000000;">fnlinenos</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">repeat</span><span style="color: #0000FF;">(-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">~~lineno~~known_files</span><span style="color: #0000FF;">))</span> <span style="color: #008080;">global</span> <span style="color: #008080;">function</span> <span style="color: #000000;">js_open</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">filename</span><span style="color: #0000FF;">)</span> <span style="color: #004080;">integer</span> <span style="color: #000000;">fn</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">filename</span><span style="color: #0000FF;">,</span><span style="color: #000000;">known_files</span><span style="color: #0000FF;">)</span> <span style="color: #7060A8;">assert</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">!=</span><span style="color: #000000;">0</span><span style="color: #0000FF;">)</span> <span style="color: #000000;">~~lineno~~linenos</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span> <span style="color: #008080;">return</span> <span style="color: #000000;">fn</span> <span style="color: #008080;">end</span> <span style="color: #008080;">function</span> <span style="color: #008080;">global</span> <span style="color: #008080;">function</span> <span style="color: #000000;">js_gets</span><span style="color: #0000FF;">(</span><span style="color: #004080;">integer</span> <span style="color: #000000;">fn</span><span style="color: #0000FF;">)</span> <span style="color: #004080;">integer</span> <span style="color: #000000;">lineno</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">linenos</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">]+</span><span style="color: #000000;">1</span> <span style="color: #008080;">if</span> <span style="color: #000000;">lineno</span><span style="color: #0000FF;">><=</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">kfc</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">])</span> <span style="color: #008080;">then~~</span> <span style="color: #008080;">return</span> <span style="color: #000000;">EOF</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if~~</span> ~~<span~~ ~~style="color:~~ ~~#008080;">return</span>~~ <span style="color: #000000;">~~kfc~~linenos</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">][</span> <span style="color: #~~000000~~0000FF;">~~lineno~~=</span> <span style="color: #~~0000FF~~000000;">]lineno</span> <span style="color: #008080;">return</span> <span style="color: #000000;">kfc</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">][</span><span style="color: #000000;">lineno</span><span style="color: #0000FF;">]</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span> <span style="color: #008080;">return</span> <span style="color: #000000;">EOF</span> <span style="color: #008080;">end</span> <span style="color: #008080;">function</span> <!--</~~lang~~syntaxhighlight>--> The main lexer is also written to be reusable by later stages. <!--<~~lang~~syntaxhighlight ~~Phix~~lang="phix">(phixonline)--> <span style="color: #000080;font-style:italic;">-- -- demo\\rosetta\\Compiler\\lex.e Line 10,423 ⟶ 14,722: <span style="color: #000000;">line</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span> <span style="color: #000000;">col</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span> <span style="color: #000000;">oneline</span> <span style="color: #0000FF;">=</span> <span style="color: #008080;">iff</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">platform</span><span style="color: #0000FF;">()=</span><span style="color: #004600;">JS</span><span style="color: #0000FF;">?</span><span style="color: #000000;">js_gets</span><span style="color: #0000FF;">(</span><span style="color: #000000;">input_file</span><span style="color: #0000FF;">)</span> <span style="color: #0000FF;">:</span><span style="color: #7060A8;">gets</span><span style="color: #0000FF;">(</span><span style="color: #000000;">input_file</span><span style="color: #0000FF;">))</span> <span style="color: #008080;">else</span> Line 10,433 ⟶ 14,732: <span style="color: #008080;">end</span> <span style="color: #008080;">function</span> <span style="color: #000080;font-style:italic;">-- for pwa/p2js (JavaScript really dislikes tabs in strings): --constant whitespace = " \t\r\n\x0B\xA0"</span> <span style="color: #008080;">constant</span> <span style="color: #000000;">whitespace</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{</span><span style="color: #008000;">' '</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'\t'</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'\r'</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'\n'</span><span style="color: #0000FF;">,</span><span style="color: #000000;">#0B</span><span style="color: #0000FF;">,</span><span style="color: #000000;">#A0</span><span style="color: #0000FF;">}</span> <span style="color: #000080;font-style:italic;">-- (0x0B is Vertical Tab, 0xA0 is Non-breaking space)</span> Line 10,513 ⟶ 14,814: <span style="color: #008080;">function</span> <span style="color: #000000;">get_op</span><span style="color: #0000FF;">()</span> <span style="color: #000080;font-style:italic;">-- sequence operator = {ch}</span> <span style="color: #004080;">string</span> <span style="color: #000000;">operator</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">""</span><span style="color: #0000FF;">&</span><span style="color: #000000;">ch</span> <span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span> Line 10,579 ⟶ 14,881: <span style="color: #008080;">return</span> <span style="color: #000000;">toks</span> <span style="color: #008080;">end</span> <span style="color: #008080;">function</span> <!--</~~lang~~syntaxhighlight>--> Optional: if you need human-readable output/input at each (later) stage, so you can use pipes <!--<~~lang~~syntaxhighlight ~~Phix~~lang="phix">--> <span style="color: #000080;font-style:italic;">-- -- demo\rosetta\Compiler\extra.e Line 10,634 ⟶ 14,936: <span style="color: #008080;">return</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">n_type</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">left</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">right</span><span style="color: #0000FF;">}</span> <span style="color: #008080;">end</span> <span style="color: #008080;">function</span> <!--</~~lang~~syntaxhighlight>--> Finally, a simple test driver for the specific task: <!--<~~lang~~syntaxhighlight ~~Phix~~lang="phix">(phixonline)--> <span style="color: #000080;font-style:italic;">-- -- demo\rosetta\Compiler\lex.exw Line 10,664 ⟶ 14,966: <span style="color: #000080;font-style:italic;">--main(command_line())</span> <span style="color: #000000;">main</span><span style="color: #0000FF;">({</span><span style="color: #000000;">0</span><span style="color: #0000FF;">,</span><span style="color: #000000;">0</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"test4.c"</span><span style="color: #0000FF;">})</span> <!--</~~lang~~syntaxhighlight>--> {{out}} <pre> Line 10,687 ⟶ 14,989: =={{header\|Prolog}}== <~~lang~~syntaxhighlight lang="prolog">/* Test harness for the analyzer, not needed if we are actually using the output. / Line 10,847 ⟶ 15,149: % anything else is an error tok(_,_,L,P) --> { format(atom(Error), 'Invalid token at line ~d,~d', [L,P]), throw(Error) }.</~~lang~~syntaxhighlight> {{out}} <pre> Line 10,888 ⟶ 15,190: =={{header\|Python}}== Tested with Python 2.7 and 3.x <~~lang~~syntaxhighlight ~~Python~~lang="python">from __future__ import print_function import sys Line 10,969 ⟶ 15,271: #** "string" def string_lit(start, err_line, err_col): global the_ch text = "" Line 10,976 ⟶ 15,279: if the_ch == '\n': error(err_line, err_col, "EOL while scanning string literal") if the_ch == '\\': next_ch() if the_ch != 'n': error(err_line, err_col, "escape sequence unknown \\%c" % the_ch) the_ch = '\n' text += the_ch Line 11,063 ⟶ 15,371: if tok == tk_EOI: break</~~lang~~syntaxhighlight> {{out\|case=test case 3}} Line 11,107 ⟶ 15,415: =={{header\|QB64}}== Tested with QB64 1.5 <~~lang~~syntaxhighlight lang="vb">dim shared source as string, the_ch as string, tok as string, toktyp as string dim shared line_n as integer, col_n as integer, text_p as integer, err_line as integer, err_col as integer, errors as integer Line 11,347 ⟶ 15,655: end end sub </syntaxhighlight> ~~</lang>~~ {{out\|case=test case 3}} <b> Line 11,387 ⟶ 15,695: =={{header\|Racket}}== <~~lang~~syntaxhighlight lang="racket"> #lang racket (require parser-tools/lex) Line 11,543 ⟶ 15,851: "TEST 5" (display-tokens (string->tokens test5)) </syntaxhighlight> ~~</lang>~~ =={{header\|Raku}}== Line 11,553 ⟶ 15,861: {{works with\|Rakudo\|2016.08}} <syntaxhighlight lang="raku" ~~perl6~~line>grammar tiny_C { rule TOP { ^ <.whitespace>? <tokens> + % <.whitespace> <.whitespace> <eoi> } Line 11,646 ⟶ 15,954: my $tokenizer = tiny_C.parse(@ARGS[0].IO.slurp); parse_it( $tokenizer );</~~lang~~syntaxhighlight> {{out\|case=test case 3}} Line 11,685 ⟶ 15,993: 23 1 End_of_input </pre> =={{header\|RATFOR}}== {{works with\|ratfor77\|[https://sourceforge.net/p/chemoelectric/ratfor77/ public domain 1.0]}} {{works with\|gfortran\|11.2.1}} {{works with\|f2c\|20100827}} <syntaxhighlight lang="ratfor">###################################################################### # # The Rosetta Code scanner in Ratfor 77. # # # How to deal with FORTRAN 77 input is a problem. I use formatted # input, treating each line as an array of type CHARACTER--regrettably # of no more than some predetermined, finite length. It is a very # simple method and presents no significant difficulties, aside from # the restriction on line length of the input. # # # On a POSIX platform, the program can be compiled with f2c and run # somewhat as follows: # # ratfor77 lex-in-ratfor.r > lex-in-ratfor.f # f2c -C -Nc40 lex-in-ratfor.f # cc -O lex-in-ratfor.c -lf2c # ./a.out < compiler-tests/primes.t # # With gfortran, a little differently: # # ratfor77 lex-in-ratfor.r > lex-in-ratfor.f # gfortran -O2 -fcheck=all -std=legacy lex-in-ratfor.f # ./a.out < compiler-tests/primes.t # # # I/O is strictly from default input and to default output, which, on # POSIX systems, usually correspond respectively to standard input and # standard output. (I did not wish to have to deal with unit numbers; # these are now standardized in ISO_FORTRAN_ENV, but that is not # available in FORTRAN 77.) # #--------------------------------------------------------------------- # Some parameters you may with to modify. define(LINESZ, 256) # Size of an input line. define(OUTLSZ, 512) # Size of an output line. define(PSHBSZ, 10) # Size of the character pushback buffer. define(STRNSZ, 4096) # Size of the string pool. #--------------------------------------------------------------------- define(EOF, -1) define(NEWLIN, 10) # Unix newline (the LF control character). define(BACKSL, 92) # ASCII backslash. define(ILINNO, 1) # Line number's index. define(ICOLNO, 2) # Column number's index. define(CHRSZ, 3) # See ILINNO and ICOLNO above. define(ICHRCD, 3) # Character code's index. define(TOKSZ, 5) # See ILINNO and ICOLNO above. define(ITOKNO, 3) # Token number's index. define(IARGIX, 4) # Index of the string pool index. define(IARGLN, 5) # Index of the string length. define(TKELSE, 0) define(TKIF, 1) define(TKPRNT, 2) define(TKPUTC, 3) define(TKWHIL, 4) define(TKMUL, 5) define(TKDIV, 6) define(TKMOD, 7) define(TKADD, 8) define(TKSUB, 9) define(TKNEG, 10) define(TKLT, 11) define(TKLE, 12) define(TKGT, 13) define(TKGE, 14) define(TKEQ, 15) define(TKNE, 16) define(TKNOT, 17) define(TKASGN, 18) define(TKAND, 19) define(TKOR, 20) define(TKLPAR, 21) define(TKRPAR, 22) define(TKLBRC, 23) define(TKRBRC, 24) define(TKSEMI, 25) define(TKCMMA, 26) define(TKID, 27) define(TKINT, 28) define(TKSTR, 29) define(TKEOI, 30) define(LOC10, 1) # Location of "10" in the string pool. define(LOC92, 3) # Location of "92" in the string pool. #--------------------------------------------------------------------- subroutine addstr (strngs, istrng, src, i0, n0, i, n) # Add a string to the string pool. implicit none character strngs(STRNSZ) # String pool. integer istrng # String pool's next slot. character src() # Source string. integer i0, n0 # Index and length in source string. integer i, n # Index and length in string pool. integer j if (STRNSZ < istrng + (n0 - 1)) { write (, '(''string pool exhausted'')') stop } for (j = 0; j < n0; j = j + 1) strngs(istrng + j) = src(i0 + j) i = istrng n = n0 istrng = istrng + n0 end subroutine cpystr (strngs, i, n, dst, i0) # Copy a string from the string pool to an output string. implicit none character strngs(STRNSZ) # String pool. integer i, n # Index and length in string pool. character dst(OUTLSZ) # Destination string. integer i0 # Index within destination string. integer j if (i0 < 1 \|\| OUTLSZ < i0 + (n - 1)) { write (, '(''string boundary exceeded'')') stop } for (j = 0; j < n; j = j + 1) dst(i0 + j) = strngs(i + j) end #--------------------------------------------------------------------- subroutine getchr (line, linno, colno, pushbk, npshbk, chr) # Get a character, with its line number and column number. implicit none character line(LINESZ) # Input buffer. integer linno, colno # Current line and column numbers. integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer. integer npshbk # Number of characters pushed back. integer chr(CHRSZ) # End of file is indicated (as in C) by a negative "char code" # called "EOF". character20 fmt integer stat integer chr1(CHRSZ) if (0 < npshbk) { chr(ICHRCD) = pushbk(ICHRCD, npshbk) chr(ILINNO) = pushbk(ILINNO, npshbk) chr(ICOLNO) = pushbk(ICOLNO, npshbk) npshbk = npshbk - 1 } else if (colno <= LINESZ) { chr(ICHRCD) = ichar (line(colno)) chr(ILINNO) = linno chr(ICOLNO) = colno colno = colno + 1 } else { # Return a newline character. chr(ICHRCD) = NEWLIN chr(ILINNO) = linno chr(ICOLNO) = colno # Fetch a new line. linno = linno + 1 colno = 1 # Read a line of text as an array of characters. write (fmt, '(''('', I10, ''A1)'')') LINESZ read (, fmt, iostat = stat) line if (stat != 0) { # If end of file has been reached, push an EOF. chr1(ICHRCD) = EOF chr1(ILINNO) = linno chr1(ICOLNO) = colno call pshchr (pushbk, npshbk, chr1) } } end subroutine pshchr (pushbk, npshbk, chr) # Push back a character. implicit none integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer. integer npshbk # Number of characters pushed back. integer chr(CHRSZ) if (PSHBSZ <= npshbk) { write (, '(''pushback buffer overfull'')') stop } npshbk = npshbk + 1 pushbk(ICHRCD, npshbk) = chr(ICHRCD) pushbk(ILINNO, npshbk) = chr(ILINNO) pushbk(ICOLNO, npshbk) = chr(ICOLNO) end subroutine getpos (line, linno, colno, pushbk, npshbk, ln, cn) # Get the position of the next character. implicit none character line(LINESZ) # Input buffer. integer linno, colno # Current line and column numbers. integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer. integer npshbk # Number of characters pushed back. integer ln, cn # The line and column nos. returned. integer chr(CHRSZ) call getchr (line, linno, colno, pushbk, npshbk, chr) ln = chr(ILINNO) cn = chr(ICOLNO) call pshchr (pushbk, npshbk, chr) end #--------------------------------------------------------------------- function isspc (c) # Is c character code for a space? implicit none integer c logical isspc # # The following is correct for ASCII: 32 is the SPACE character, and # 9 to 13 are control characters commonly regarded as spaces. # # In Unicode these are all code points for spaces, but so are others # besides. # isspc = (c == 32 \|\| (9 <= c && c <= 13)) end function isdgt (c) # Is c character code for a digit? implicit none integer c logical isdgt isdgt = (ichar ('0') <= c && c <= ichar ('9')) end function isalph (c) # Is c character code for a letter? implicit none integer c logical isalph # # The following is correct for ASCII and Unicode, but not for # EBCDIC. # isalph = (ichar ('a') <= c && c <= ichar ('z')) _ \|\| (ichar ('A') <= c && c <= ichar ('Z')) end function isid0 (c) # Is c character code for the start of an identifier? implicit none integer c logical isid0 logical isalph isid0 = isalph (c) \|\| c == ichar ('_') end function isid1 (c) # Is c character code for the continuation of an identifier? implicit none integer c logical isid1 logical isalph logical isdgt isid1 = isalph (c) \|\| isdgt (c) \|\| c == ichar ('_') end #--------------------------------------------------------------------- function trimlf (str, n) # "Trim left" leading spaces. implicit none character str() # The string to "trim". integer n # The length. integer trimlf # The index of the first non-space # character, or n + 1. logical isspc integer j logical done j = 1 done = .false. while (!done) { if (j == n + 1) done = .true. else if (!isspc (ichar (str(j)))) done = .true. else j = j + 1 } trimlf = j end function trimrt (str, n) # "Trim right" trailing spaces. implicit none character str() # The string to "trim". integer n # The length including trailing spaces. integer trimrt # The length without trailing spaces. logical isspc integer j logical done j = n done = .false. while (!done) { if (j == 0) done = .true. else if (!isspc (ichar (str(j)))) done = .true. else j = j - 1 } trimrt = j end #--------------------------------------------------------------------- subroutine toknam (tokno, str, i) # Copy a token name to the character array str, starting at i. implicit none integer tokno character str() integer i integer j character16 names(0:30) character16 nm data names / "Keyword_else ", _ "Keyword_if ", _ "Keyword_print ", _ "Keyword_putc ", _ "Keyword_while ", _ "Op_multiply ", _ "Op_divide ", _ "Op_mod ", _ "Op_add ", _ "Op_subtract ", _ "Op_negate ", _ "Op_less ", _ "Op_lessequal ", _ "Op_greater ", _ "Op_greaterequal ", _ "Op_equal ", _ "Op_notequal ", _ "Op_not ", _ "Op_assign ", _ "Op_and ", _ "Op_or ", _ "LeftParen ", _ "RightParen ", _ "LeftBrace ", _ "RightBrace ", _ "Semicolon ", _ "Comma ", _ "Identifier ", _ "Integer ", _ "String ", _ "End_of_input " / nm = names(tokno) for (j = 0; j < 16; j = j + 1) str(i + j) = nm(1 + j : 1 + j) end subroutine intstr (str, i, n, x) # Convert a positive integer to a substring. implicit none character str() # Destination string. integer i, n # Index and length of the field. integer x # The positive integer to represent. integer j integer y if (x == 0) { for (j = 0; j < n - 1; j = j + 1) str(i + j) = ' ' str(i + j) = '0' } else { y = x for (j = n - 1; 0 <= j; j = j - 1) { if (y == 0) str(i + j) = ' ' else { str(i + j) = char (mod (y, 10) + ichar ('0')) y = y / 10 } } } end subroutine prttok (strngs, tok) # Print a token. implicit none character strngs(STRNSZ) # String pool. integer tok(TOKSZ) # The token. integer trimrt character line(OUTLSZ) character20 fmt integer i, n integer tokno for (i = 1; i <= OUTLSZ; i = i + 1) line(i) = ' ' call intstr (line, 1, 10, tok(ILINNO)) call intstr (line, 12, 10, tok(ICOLNO)) tokno = tok(ITOKNO) call toknam (tokno, line, 25) if (tokno == TKID \|\| tokno == TKINT \|\| tokno == TKSTR) { i = tok(IARGIX) n = tok(IARGLN) call cpystr (strngs, i, n, line, 45) } n = trimrt (line, OUTLSZ) write (fmt, '(''('', I10, ''A)'')') n write (, fmt) (line(i), i = 1, n) end #--------------------------------------------------------------------- subroutine wrtpos (ln, cn) implicit none integer ln, cn write (, 1000) ln, cn 1000 format ('At line ', I5, ', column ' I5) end #--------------------------------------------------------------------- subroutine utcmnt (ln, cn) implicit none integer ln, cn call wrtpos (ln, cn) write (, '(''Unterminated comment'')') stop end subroutine skpcmt (line, linno, colno, pushbk, npshbk, ln, cn) # Skip to the end of a comment. implicit none character line(LINESZ) # Input buffer. integer linno, colno # Current line and column numbers. integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer. integer npshbk # Number of characters pushed back. integer ln, cn # Line and column of start of comment. integer chr(CHRSZ) logical done done = .false. while (!done) { call getchr (line, linno, colno, pushbk, npshbk, chr) if (chr(ICHRCD) == ichar ('')) { call getchr (line, linno, colno, pushbk, npshbk, chr) if (chr(ICHRCD) == ichar ('/')) done = .true. else if (chr(ICHRCD) == EOF) call utcmnt (ln, cn) } else if (chr(ICHRCD) == EOF) call utcmnt (ln, cn) } end subroutine skpspc (line, linno, colno, pushbk, npshbk) # Skip spaces and comments. implicit none character line(LINESZ) # Input buffer. integer linno, colno # Current line and column numbers. integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer. integer npshbk # Number of characters pushed back. logical isspc integer chr(CHRSZ) integer chr1(CHRSZ) integer ln, cn logical done done = .false. while (!done) { call getchr (line, linno, colno, pushbk, npshbk, chr) if (!isspc (chr(ICHRCD))) { if (chr(ICHRCD) != ichar ('/')) { call pshchr (pushbk, npshbk, chr) done = .true. } else { call getchr (line, linno, colno, pushbk, npshbk, chr1) if (chr1(ICHRCD) != ichar ('')) { call pshchr (pushbk, npshbk, chr1) call pshchr (pushbk, npshbk, chr) done = .true. } else { ln = chr(ILINNO) cn = chr(ICOLNO) call skpcmt (line, linno, colno, pushbk, npshbk, _ ln, cn) } } } } end #--------------------------------------------------------------------- subroutine rwdlkp (strngs, istrng, src, i0, n0, ln, cn, tok) # Reserved word lookup implicit none character strngs(STRNSZ) # String pool. integer istrng # String pool's next slot. character src() # The source string. integer i0, n0 # Index and length of the substring. integer ln, cn # Line and column number # to associate with the token. integer tok(TOKSZ) # The output token. integer tokno integer i, n tokno = TKID if (n0 == 2) { if (src(i0) == 'i' && src(i0 + 1) == 'f') tokno = TKIF } else if (n0 == 4) { if (src(i0) == 'e' && src(i0 + 1) == 'l' _ && src(i0 + 2) == 's' && src(i0 + 3) == 'e') tokno = TKELSE else if (src(i0) == 'p' && src(i0 + 1) == 'u' _ && src(i0 + 2) == 't' && src(i0 + 3) == 'c') tokno = TKPUTC } else if (n0 == 5) { if (src(i0) == 'p' && src(i0 + 1) == 'r' _ && src(i0 + 2) == 'i' && src(i0 + 3) == 'n' _ && src(i0 + 4) == 't') tokno = TKPRNT else if (src(i0) == 'w' && src(i0 + 1) == 'h' _ && src(i0 + 2) == 'i' && src(i0 + 3) == 'l' _ && src(i0 + 4) == 'e') tokno = TKWHIL } i = 0 n = 0 if (tokno == TKID) call addstr (strngs, istrng, src, i0, n0, i, n) tok(ITOKNO) = tokno tok(IARGIX) = i tok(IARGLN) = n tok(ILINNO) = ln tok(ICOLNO) = cn end subroutine scnwrd (line, linno, colno, pushbk, npshbk, ln, cn, buf, n) # Scan characters that may represent an identifier, reserved word, # or integer literal. implicit none character line(LINESZ) # Input buffer. integer linno, colno # Current line and column numbers. integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer. integer npshbk # Number of characters pushed back. integer ln, cn # Line and column number of the start. character buf(LINESZ) # The output buffer. integer n # The length of the string collected. logical isid1 integer chr(CHRSZ) n = 0 call getchr (line, linno, colno, pushbk, npshbk, chr) ln = chr(ILINNO) cn = chr(ICOLNO) while (isid1 (chr(ICHRCD))) { n = n + 1 buf(n) = char (chr(ICHRCD)) call getchr (line, linno, colno, pushbk, npshbk, chr) } call pshchr (pushbk, npshbk, chr) end subroutine scnidr (strngs, istrng, _ line, linno, colno, pushbk, npshbk, _ tok) # Scan an identifier or reserved word. implicit none character strngs(STRNSZ) # String pool. integer istrng # String pool's next slot. character line(LINESZ) # Input buffer. integer linno, colno # Current line and column numbers. integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer. integer npshbk # Number of characters pushed back. integer ln, cn # Line and column number of the start. integer tok(TOKSZ) # The output token. character buf(LINESZ) integer n call scnwrd (line, linno, colno, pushbk, npshbk, ln, cn, buf, n) call rwdlkp (strngs, istrng, buf, 1, n, ln, cn, tok) end subroutine scnint (strngs, istrng, _ line, linno, colno, pushbk, npshbk, _ tok) # Scan a positive integer literal. implicit none character strngs(STRNSZ) # String pool. integer istrng # String pool's next slot. character line(LINESZ) # Input buffer. integer linno, colno # Current line and column numbers. integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer. integer npshbk # Number of characters pushed back. integer ln, cn # Line and column number of the start. integer tok(TOKSZ) # The output token. logical isdgt character buf(LINESZ) integer n0, n integer i, j, k character80 fmt call scnwrd (line, linno, colno, pushbk, npshbk, ln, cn, buf, n0) for (j = 1; j <= n0; j = j + 1) if (!isdgt (ichar (buf(j)))) { call wrtpos (ln, cn) write (fmt, 1000) n0 1000 format ('(''Not a legal word: "''', I10, 'A, ''"'')') write (, fmt) (buf(k), k = 1, n0) stop } call addstr (strngs, istrng, buf, 1, n0, i, n) tok(ITOKNO) = TKINT tok(IARGIX) = i tok(IARGLN) = n tok(ILINNO) = ln tok(ICOLNO) = cn end subroutine utclit (ln, cn) implicit none integer ln, cn call wrtpos (ln, cn) write (, '(''Unterminated character literal'')') stop end subroutine scnch1 (strngs, istrng, _ line, linno, colno, pushbk, npshbk, _ tok) # Scan a character literal, without yet checking that the literal # ends correctly. implicit none character strngs(STRNSZ) # String pool. integer istrng # String pool's next slot. character line(LINESZ) # Input buffer. integer linno, colno # Current line and column numbers. integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer. integer npshbk # Number of characters pushed back. integer tok(TOKSZ) # The output token. integer trimlf integer bufsz parameter (bufsz = 40) integer chr(CHRSZ) integer chr1(CHRSZ) integer chr2(CHRSZ) integer ln, cn character buf(bufsz) integer i, j, n # Refetch the opening quote. call getchr (line, linno, colno, pushbk, npshbk, chr) ln = chr(ILINNO) cn = chr(ICOLNO) tok(ITOKNO) = TKINT tok(ILINNO) = ln tok(ICOLNO) = cn call getchr (line, linno, colno, pushbk, npshbk, chr1) if (chr1(ICHRCD) == EOF) call utclit (ln, cn) if (chr1(ICHRCD) == BACKSL) { call getchr (line, linno, colno, pushbk, npshbk, chr2) if (chr2(ICHRCD) == EOF) call utclit (ln, cn) else if (chr2(ICHRCD) == ichar ('n')) { tok(IARGIX) = LOC10 # "10" = code for Unix newline tok(IARGLN) = 2 } else if (chr2(ICHRCD) == BACKSL) { tok(IARGIX) = LOC92 # "92" = code for backslash tok(IARGLN) = 2 } else { call wrtpos (ln, cn) write (, '(''Unsupported escape: '', 1A)') _ char (chr2(ICHRCD)) stop } } else { # Character codes are non-negative, so we can use intstr. call intstr (buf, 1, bufsz, chr1(ICHRCD)) j = trimlf (buf, bufsz) call addstr (strngs, istrng, buf, j, bufsz - (j - 1), i, n) tok(IARGIX) = i tok(IARGLN) = n } end subroutine scnch (strngs, istrng, _ line, linno, colno, pushbk, npshbk, _ tok) # Scan a character literal. implicit none character strngs(STRNSZ) # String pool. integer istrng # String pool's next slot. character line(LINESZ) # Input buffer. integer linno, colno # Current line and column numbers. integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer. integer npshbk # Number of characters pushed back. integer tok(TOKSZ) # The output token. integer ln, cn integer chr(CHRSZ) call getpos (line, linno, colno, pushbk, npshbk, ln, cn) call scnch1 (strngs, istrng, _ line, linno, colno, pushbk, npshbk, _ tok) call getchr (line, linno, colno, pushbk, npshbk, chr) if (chr(ICHRCD) != ichar ('''')) { while (.true.) { if (chr(ICHRCD) == EOF) { call utclit (ln, cn) stop } else if (chr(ICHRCD) == ichar ('''')) { call wrtpos (ln, cn) write (, '(''Unsupported multicharacter literal'')') stop } call getchr (line, linno, colno, pushbk, npshbk, chr) } } end subroutine scnstr (strngs, istrng, _ line, linno, colno, pushbk, npshbk, _ tok) # Scan a string literal. implicit none character strngs(STRNSZ) # String pool. integer istrng # String pool's next slot. character line(LINESZ) # Input buffer. integer linno, colno # Current line and column numbers. integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer. integer npshbk # Number of characters pushed back. integer tok(TOKSZ) # The output token. integer ln, cn integer chr1(CHRSZ) integer chr2(CHRSZ) character buf(LINESZ + 10) # Enough space, with some room to spare. integer n0 integer i, n call getchr (line, linno, colno, pushbk, npshbk, chr1) ln = chr1(ILINNO) cn = chr1(ICOLNO) tok(ITOKNO) = TKSTR tok(ILINNO) = ln tok(ICOLNO) = cn n0 = 1 buf(n0) = '"' call getchr (line, linno, colno, pushbk, npshbk, chr1) while (chr1(ICHRCD) != ichar ('"')) { # Our input method always puts a NEWLIN before EOF, and so this # test is redundant, unless someone changes the input method. if (chr1(ICHRCD) == EOF \|\| chr1(ICHRCD) == NEWLIN) { call wrtpos (ln, cn) write (, '(''Unterminated string literal'')') stop } if (chr1(ICHRCD) == BACKSL) { call getchr (line, linno, colno, pushbk, npshbk, chr2) if (chr2(ICHRCD) == ichar ('n')) { n0 = n0 + 1 buf(n0) = char (BACKSL) n0 = n0 + 1 buf(n0) = 'n' } else if (chr2(ICHRCD) == BACKSL) { n0 = n0 + 1 buf(n0) = char (BACKSL) n0 = n0 + 1 buf(n0) = char (BACKSL) } else { call wrtpos (chr1(ILINNO), chr1(ICOLNO)) write (, '(''Unsupported escape sequence'')') stop } } else { n0 = n0 + 1 buf(n0) = char (chr1(ICHRCD)) } call getchr (line, linno, colno, pushbk, npshbk, chr1) } n0 = n0 + 1 buf(n0) = '"' call addstr (strngs, istrng, buf, 1, n0, i, n) tok(IARGIX) = i tok(IARGLN) = n end subroutine unxchr (chr) implicit none integer chr(CHRSZ) call wrtpos (chr(ILINNO), chr(ICOLNO)) write (, 1000) char (chr(ICHRCD)) 1000 format ('Unexpected character ''', A1, '''') stop end subroutine scntok (strngs, istrng, _ line, linno, colno, pushbk, npshbk, _ tok) # Scan a token. implicit none character strngs(STRNSZ) # String pool. integer istrng # String pool's next slot. character line(LINESZ) # Input buffer. integer linno, colno # Current line and column numbers. integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer. integer npshbk # Number of characters pushed back. integer tok(TOKSZ) # The output token. logical isdgt logical isid0 integer chr(CHRSZ) integer chr1(CHRSZ) integer ln, cn call getchr (line, linno, colno, pushbk, npshbk, chr) ln = chr(ILINNO) cn = chr(ICOLNO) tok(ILINNO) = ln tok(ICOLNO) = cn tok(IARGIX) = 0 tok(IARGLN) = 0 if (chr(ICHRCD) == ichar (',')) tok(ITOKNO) = TKCMMA else if (chr(ICHRCD) == ichar (';')) tok(ITOKNO) = TKSEMI else if (chr(ICHRCD) == ichar ('(')) tok(ITOKNO) = TKLPAR else if (chr(ICHRCD) == ichar (')')) tok(ITOKNO) = TKRPAR else if (chr(ICHRCD) == ichar ('{')) tok(ITOKNO) = TKLBRC else if (chr(ICHRCD) == ichar ('}')) tok(ITOKNO) = TKRBRC else if (chr(ICHRCD) == ichar ('')) tok(ITOKNO) = TKMUL else if (chr(ICHRCD) == ichar ('/')) tok(ITOKNO) = TKDIV else if (chr(ICHRCD) == ichar ('%')) tok(ITOKNO) = TKMOD else if (chr(ICHRCD) == ichar ('+')) tok(ITOKNO) = TKADD else if (chr(ICHRCD) == ichar ('-')) tok(ITOKNO) = TKSUB else if (chr(ICHRCD) == ichar ('<')) { call getchr (line, linno, colno, pushbk, npshbk, chr1) if (chr1(ICHRCD) == ichar ('=')) tok(ITOKNO) = TKLE else { call pshchr (pushbk, npshbk, chr1) tok(ITOKNO) = TKLT } } else if (chr(ICHRCD) == ichar ('>')) { call getchr (line, linno, colno, pushbk, npshbk, chr1) if (chr1(ICHRCD) == ichar ('=')) tok(ITOKNO) = TKGE else { call pshchr (pushbk, npshbk, chr1) tok(ITOKNO) = TKGT } } else if (chr(ICHRCD) == ichar ('=')) { call getchr (line, linno, colno, pushbk, npshbk, chr1) if (chr1(ICHRCD) == ichar ('=')) tok(ITOKNO) = TKEQ else { call pshchr (pushbk, npshbk, chr1) tok(ITOKNO) = TKASGN } } else if (chr(ICHRCD) == ichar ('!')) { call getchr (line, linno, colno, pushbk, npshbk, chr1) if (chr1(ICHRCD) == ichar ('=')) tok(ITOKNO) = TKNE else { call pshchr (pushbk, npshbk, chr1) tok(ITOKNO) = TKNOT } } else if (chr(ICHRCD) == ichar ('&')) { call getchr (line, linno, colno, pushbk, npshbk, chr1) if (chr1(ICHRCD) == ichar ('&')) tok(ITOKNO) = TKAND else call unxchr (chr) } else if (chr(ICHRCD) == ichar ('\|')) { call getchr (line, linno, colno, pushbk, npshbk, chr1) if (chr1(ICHRCD) == ichar ('\|')) tok(ITOKNO) = TKOR else call unxchr (chr) } else if (chr(ICHRCD) == ichar ('"')) { call pshchr (pushbk, npshbk, chr) call scnstr (strngs, istrng, _ line, linno, colno, pushbk, npshbk, _ tok) } else if (chr(ICHRCD) == ichar ('''')) { call pshchr (pushbk, npshbk, chr) call scnch (strngs, istrng, _ line, linno, colno, pushbk, npshbk, _ tok) } else if (isdgt (chr(ICHRCD))) { call pshchr (pushbk, npshbk, chr) call scnint (strngs, istrng, _ line, linno, colno, pushbk, npshbk, _ tok) } else if (isid0 (chr(ICHRCD))) { call pshchr (pushbk, npshbk, chr) call scnidr (strngs, istrng, _ line, linno, colno, pushbk, npshbk, _ tok) } else call unxchr (chr) end subroutine scntxt (strngs, istrng, _ line, linno, colno, pushbk, npshbk) # Scan the text and print the token stream. implicit none character strngs(STRNSZ) # String pool. integer istrng # String pool's next slot. character line(LINESZ) # Input buffer. integer linno, colno # Current line and column numbers. integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer. integer npshbk # Number of characters pushed back. integer chr(CHRSZ) integer tok(TOKSZ) chr(ICHRCD) = ichar ('x') while (chr(ICHRCD) != EOF) { call skpspc (line, linno, colno, pushbk, npshbk) call getchr (line, linno, colno, pushbk, npshbk, chr) if (chr(ICHRCD) != EOF) { call pshchr (pushbk, npshbk, chr) call scntok (strngs, istrng, _ line, linno, colno, pushbk, npshbk, _ tok) call prttok (strngs, tok) } } tok(ITOKNO) = TKEOI tok(ILINNO) = chr(ILINNO) tok(ICOLNO) = chr(ICOLNO) tok(IARGIX) = 0 tok(IARGLN) = 0 call prttok (strngs, tok) end #--------------------------------------------------------------------- program lex implicit none character strngs(STRNSZ) # String pool. integer istrng # String pool's next slot. character line(LINESZ) # Input buffer. integer linno, colno # Current line and column numbers. integer pushbk(CHRSZ, PSHBSZ) # Pushback buffer. integer npshbk # Number of characters pushed back. integer i, n istrng = 1 # Locate "10" (newline) at 1 in the string pool. line(1) = '1' line(2) = '0' call addstr (strngs, istrng, line, 1, 2, i, n) if (i != 1 && n != 2) { write (, '(''internal error'')') stop } # Locate "92" (backslash) at 3 in the string pool. line(1) = '9' line(2) = '2' call addstr (strngs, istrng, line, 1, 2, i, n) if (i != 3 && n != 2) { write (, '(''internal error'')') stop } linno = 0 colno = LINESZ + 1 # This will trigger a READ. npshbk = 0 call scntxt (strngs, istrng, line, linno, colno, pushbk, npshbk) end ######################################################################</syntaxhighlight> {{out}} <pre>$ ratfor77 lex-in-ratfor.r > lex-in-ratfor.f && gfortran -O2 -std=legacy -fcheck=all lex-in-ratfor.f && ./a.out < compiler-tests/primes.t 4 1 Identifier count 4 7 Op_assign 4 9 Integer 1 4 10 Semicolon 5 1 Identifier n 5 3 Op_assign 5 5 Integer 1 5 6 Semicolon 6 1 Identifier limit 6 7 Op_assign 6 9 Integer 100 6 12 Semicolon 7 1 Keyword_while 7 7 LeftParen 7 8 Identifier n 7 10 Op_less 7 12 Identifier limit 7 17 RightParen 7 19 LeftBrace 8 5 Identifier k 8 6 Op_assign 8 7 Integer 3 8 8 Semicolon 9 5 Identifier p 9 6 Op_assign 9 7 Integer 1 9 8 Semicolon 10 5 Identifier n 10 6 Op_assign 10 7 Identifier n 10 8 Op_add 10 9 Integer 2 10 10 Semicolon 11 5 Keyword_while 11 11 LeftParen 11 12 LeftParen 11 13 Identifier k 11 14 Op_multiply 11 15 Identifier k 11 16 Op_lessequal 11 18 Identifier n 11 19 RightParen 11 21 Op_and 11 24 LeftParen 11 25 Identifier p 11 26 RightParen 11 27 RightParen 11 29 LeftBrace 12 9 Identifier p 12 10 Op_assign 12 11 Identifier n 12 12 Op_divide 12 13 Identifier k 12 14 Op_multiply 12 15 Identifier k 12 16 Op_notequal 12 18 Identifier n 12 19 Semicolon 13 9 Identifier k 13 10 Op_assign 13 11 Identifier k 13 12 Op_add 13 13 Integer 2 13 14 Semicolon 14 5 RightBrace 15 5 Keyword_if 15 8 LeftParen 15 9 Identifier p 15 10 RightParen 15 12 LeftBrace 16 9 Keyword_print 16 14 LeftParen 16 15 Identifier n 16 16 Comma 16 18 String " is prime\n" 16 31 RightParen 16 32 Semicolon 17 9 Identifier count 17 15 Op_assign 17 17 Identifier count 17 23 Op_add 17 25 Integer 1 17 26 Semicolon 18 5 RightBrace 19 1 RightBrace 20 1 Keyword_print 20 6 LeftParen 20 7 String "Total primes found: " 20 29 Comma 20 31 Identifier count 20 36 Comma 20 38 String "\n" 20 42 RightParen 20 43 Semicolon 21 1 End_of_input</pre> =={{header\|Scala}}== Line 11,691 ⟶ 17,336: The following code implements a configurable (from a symbol map and keyword map provided as parameters) lexical analyzer. <~~lang~~syntaxhighlight lang="scala"> package xyz.hyperreal.rosettacodeCompiler Line 11,952 ⟶ 17,597: } </syntaxhighlight> ~~</lang>~~ =={{header\|Scheme}}== <~~lang~~syntaxhighlight lang="scheme"> (import (scheme base) (scheme char) Line 12,153 ⟶ 17,798: (display-tokens (lexer (cadr (command-line)))) (display "Error: provide program filename\n")) </syntaxhighlight> ~~</lang>~~ {{out}} Line 12,165 ⟶ 17,810: 5 1 End_of_input </pre> =={{header\|Standard ML}}== {{trans\|ATS}} {{trans\|OCaml}} <syntaxhighlight lang="sml">(------------------------------------------------------------------) ( The Rosetta Code lexical analyzer, in Standard ML. Based on the ATS and the OCaml. The intended compiler is Mlton or Poly/ML; there is a tiny difference near the end of the file, depending on which compiler is used. ) (------------------------------------------------------------------) ( The following functions are compatible with ASCII. ) fun is_digit ichar = 48 <= ichar andalso ichar <= 57 fun is_lower ichar = 97 <= ichar andalso ichar <= 122 fun is_upper ichar = 65 <= ichar andalso ichar <= 90 fun is_alpha ichar = is_lower ichar orelse is_upper ichar fun is_alnum ichar = is_digit ichar orelse is_alpha ichar fun is_ident_start ichar = is_alpha ichar orelse ichar = 95 fun is_ident_continuation ichar = is_alnum ichar orelse ichar = 95 fun is_space ichar = ichar = 32 orelse (9 <= ichar andalso ichar <= 13) (------------------------------------------------------------------) ( Character input more like that of C. There are various advantages and disadvantages to this method, but key points in its favor are: (a) it is how character input is done in the original ATS code, (b) Unicode code points are 21-bit positive integers. ) val eof = ~1 fun input_ichar inpf = case TextIO.input1 inpf of NONE => eof \| SOME c => Char.ord c (------------------------------------------------------------------) ( The type of an input character. ) structure Ch = struct type t = { ichar : int, line_no : int, column_no : int } end (------------------------------------------------------------------) ( Inputting with unlimited pushback, and with counting of lines and columns. ) structure Inp = struct type t = { inpf : TextIO.instream, pushback : Ch.t list, line_no : int, column_no : int } fun of_instream inpf = { inpf = inpf, pushback = [], line_no = 1, column_no = 1 } : t fun get_ch ({ inpf = inpf, pushback = pushback, line_no = line_no, column_no = column_no } : t) = case pushback of ch :: tail => let val inp = { inpf = inpf, pushback = tail, line_no = line_no, column_no = column_no } in (ch, inp) end \| [] => let val ichar = input_ichar inpf val ch = { ichar = ichar, line_no = line_no, column_no = column_no } in if ichar = Char.ord #"\n" then let val inp = { inpf = inpf, pushback = [], line_no = line_no + 1, column_no = 1 } in (ch, inp) end else let val inp = { inpf = inpf, pushback = [], line_no = line_no, column_no = column_no + 1 } in (ch, inp) end end fun push_back_ch (ch, inp : t) = { inpf = #inpf inp, pushback = ch :: #pushback inp, line_no = #line_no inp, column_no = #column_no inp } end (------------------------------------------------------------------) ( Tokens, appearing in tuples with arguments, and with line and column numbers. The tokens are integers, so they can be used as array indices. ) val token_ELSE = 0 val token_IF = 1 val token_PRINT = 2 val token_PUTC = 3 val token_WHILE = 4 val token_MULTIPLY = 5 val token_DIVIDE = 6 val token_MOD = 7 val token_ADD = 8 val token_SUBTRACT = 9 val token_NEGATE = 10 val token_LESS = 11 val token_LESSEQUAL = 12 val token_GREATER = 13 val token_GREATEREQUAL = 14 val token_EQUAL = 15 val token_NOTEQUAL = 16 val token_NOT = 17 val token_ASSIGN = 18 val token_AND = 19 val token_OR = 20 val token_LEFTPAREN = 21 val token_RIGHTPAREN = 22 val token_LEFTBRACE = 23 val token_RIGHTBRACE = 24 val token_SEMICOLON = 25 val token_COMMA = 26 val token_IDENTIFIER = 27 val token_INTEGER = 28 val token_STRING = 29 val token_END_OF_INPUT = 30 ( A very simple perfect hash for the reserved words. (Yes, this is overkill, except for demonstration of the principle.) ) val reserved_words = Vector.fromList ["if", "print", "else", "", "putc", "", "", "while", ""] val reserved_word_tokens = Vector.fromList [token_IF, token_PRINT, token_ELSE, token_IDENTIFIER, token_PUTC, token_IDENTIFIER, token_IDENTIFIER, token_WHILE, token_IDENTIFIER] fun reserved_word_lookup (s, line_no, column_no) = if (String.size s) < 2 then (token_IDENTIFIER, s, line_no, column_no) else let val hashval = (Char.ord (String.sub (s, 0)) + Char.ord (String.sub (s, 1))) mod 9 val token = Vector.sub (reserved_word_tokens, hashval) in if token = token_IDENTIFIER orelse s <> Vector.sub (reserved_words, hashval) then (token_IDENTIFIER, s, line_no, column_no) else (token, s, line_no, column_no) end ( Token to string lookup. ) val token_names = Vector.fromList ["Keyword_else", "Keyword_if", "Keyword_print", "Keyword_putc", "Keyword_while", "Op_multiply", "Op_divide", "Op_mod", "Op_add", "Op_subtract", "Op_negate", "Op_less", "Op_lessequal", "Op_greater", "Op_greaterequal", "Op_equal", "Op_notequal", "Op_not", "Op_assign", "Op_and", "Op_or", "LeftParen", "RightParen", "LeftBrace", "RightBrace", "Semicolon", "Comma", "Identifier", "Integer", "String", "End_of_input"] fun token_name token = Vector.sub (token_names, token) (------------------------------------------------------------------) exception Unterminated_comment of int int exception Unterminated_character_literal of int * int exception Multicharacter_literal of int * int exception End_of_input_in_string_literal of int * int exception End_of_line_in_string_literal of int * int exception Unsupported_escape of int * int * char exception Invalid_integer_literal of int * int * string exception Unexpected_character of int * int * char (------------------------------------------------------------------) (* Skipping past spaces and comments. (In the Rosetta Code tiny language, a comment, if you think about it, is a kind of space.) ) fun scan_comment (inp, line_no, column_no) = let fun loop inp = let val (ch, inp) = Inp.get_ch inp in if #ichar ch = eof then raise Unterminated_comment (line_no, column_no) else if #ichar ch = Char.ord #"" then let val (ch1, inp) = Inp.get_ch inp in if #ichar ch1 = eof then raise Unterminated_comment (line_no, column_no) else if #ichar ch1 = Char.ord #"/" then inp else loop inp end else loop inp end in loop inp end fun skip_spaces_and_comments inp = let fun loop inp = let val (ch, inp) = Inp.get_ch inp in if is_space (#ichar ch) then loop inp else if #ichar ch = Char.ord #"/" then let val (ch1, inp) = Inp.get_ch inp in if #ichar ch1 = Char.ord #"" then loop (scan_comment (inp, #line_no ch, #column_no ch)) else let val inp = Inp.push_back_ch (ch1, inp) val inp = Inp.push_back_ch (ch, inp) in inp end end else Inp.push_back_ch (ch, inp) end in loop inp end (------------------------------------------------------------------) ( Integer literals, identifiers, and reserved words. ) fun scan_word (lst, inp) = let val (ch, inp) = Inp.get_ch inp in if is_ident_continuation (#ichar ch) then scan_word (Char.chr (#ichar ch) :: lst, inp) else (lst, Inp.push_back_ch (ch, inp)) end fun scan_integer_literal inp = let val (ch, inp) = Inp.get_ch inp val (lst, inp) = scan_word ([Char.chr (#ichar ch)], inp) val s = String.implode (List.rev lst) in if List.all (fn c => is_digit (Char.ord c)) lst then ((token_INTEGER, s, #line_no ch, #column_no ch), inp) else raise Invalid_integer_literal (#line_no ch, #column_no ch, s) end fun scan_identifier_or_reserved_word inp = let val (ch, inp) = Inp.get_ch inp val (lst, inp) = scan_word ([Char.chr (#ichar ch)], inp) val s = String.implode (List.rev lst) val toktup = reserved_word_lookup (s, #line_no ch, #column_no ch) in (toktup, inp) end (------------------------------------------------------------------) ( String literals. ) fun scan_string_literal inp = let val (ch, inp) = Inp.get_ch inp fun scan (lst, inp) = let val (ch1, inp) = Inp.get_ch inp in if #ichar ch1 = eof then raise End_of_input_in_string_literal (#line_no ch, #column_no ch) else if #ichar ch1 = Char.ord #"\n" then raise End_of_line_in_string_literal (#line_no ch, #column_no ch) else if #ichar ch1 = Char.ord #"\"" then (lst, inp) else if #ichar ch1 <> Char.ord #"\\" then scan (Char.chr (#ichar ch1) :: lst, inp) else let val (ch2, inp) = Inp.get_ch inp in if #ichar ch2 = Char.ord #"n" then scan (#"n" :: #"\\" :: lst, inp) else if #ichar ch2 = Char.ord #"\\" then scan (#"\\" :: #"\\" :: lst, inp) else if #ichar ch2 = eof then raise End_of_input_in_string_literal (#line_no ch, #column_no ch) else if #ichar ch2 = Char.ord #"\n" then raise End_of_line_in_string_literal (#line_no ch, #column_no ch) else raise Unsupported_escape (#line_no ch1, #column_no ch1, Char.chr (#ichar ch2)) end end val lst = #"\"" :: [] val (lst, inp) = scan (lst, inp) val lst = #"\"" :: lst val s = String.implode (List.rev lst) in ((token_STRING, s, #line_no ch, #column_no ch), inp) end (------------------------------------------------------------------) ( Character literals. ) fun scan_character_literal_without_checking_end inp = let val (ch, inp) = Inp.get_ch inp val (ch1, inp) = Inp.get_ch inp in if #ichar ch1 = eof then raise Unterminated_character_literal (#line_no ch, #column_no ch) else if #ichar ch1 = Char.ord #"\\" then let val (ch2, inp) = Inp.get_ch inp in if #ichar ch2 = eof then raise Unterminated_character_literal (#line_no ch, #column_no ch) else if #ichar ch2 = Char.ord #"n" then let val s = Int.toString (Char.ord #"\n") in ((token_INTEGER, s, #line_no ch, #column_no ch), inp) end else if #ichar ch2 = Char.ord #"\\" then let val s = Int.toString (Char.ord #"\\") in ((token_INTEGER, s, #line_no ch, #column_no ch), inp) end else raise Unsupported_escape (#line_no ch1, #column_no ch1, Char.chr (#ichar ch2)) end else let val s = Int.toString (#ichar ch1) in ((token_INTEGER, s, #line_no ch, #column_no ch), inp) end end fun scan_character_literal inp = let val (toktup, inp) = scan_character_literal_without_checking_end inp val (_, _, line_no, column_no) = toktup fun check_end inp = let val (ch, inp) = Inp.get_ch inp in if #ichar ch = Char.ord #"'" then inp else let fun loop_to_end (ch1 : Ch.t, inp) = if #ichar ch1 = eof then raise Unterminated_character_literal (line_no, column_no) else if #ichar ch1 = Char.ord #"'" then raise Multicharacter_literal (line_no, column_no) else let val (ch1, inp) = Inp.get_ch inp in loop_to_end (ch1, inp) end in loop_to_end (ch, inp) end end val inp = check_end inp in (toktup, inp) end (------------------------------------------------------------------) fun get_next_token inp = let val inp = skip_spaces_and_comments inp val (ch, inp) = Inp.get_ch inp val ln = #line_no ch val cn = #column_no ch in if #ichar ch = eof then ((token_END_OF_INPUT, "", ln, cn), inp) else case Char.chr (#ichar ch) of #"," => ((token_COMMA, ",", ln, cn), inp) \| #";" => ((token_SEMICOLON, ";", ln, cn), inp) \| #"(" => ((token_LEFTPAREN, "(", ln, cn), inp) \| #")" => ((token_RIGHTPAREN, ")", ln, cn), inp) \| #"{" => ((token_LEFTBRACE, "{", ln, cn), inp) \| #"}" => ((token_RIGHTBRACE, "}", ln, cn), inp) \| #"" => ((token_MULTIPLY, "", ln, cn), inp) \| #"/" => ((token_DIVIDE, "/", ln, cn), inp) \| #"%" => ((token_MOD, "%", ln, cn), inp) \| #"+" => ((token_ADD, "+", ln, cn), inp) \| #"-" => ((token_SUBTRACT, "-", ln, cn), inp) \| #"<" => let val (ch1, inp) = Inp.get_ch inp in if #ichar ch1 = Char.ord #"=" then ((token_LESSEQUAL, "<=", ln, cn), inp) else let val inp = Inp.push_back_ch (ch1, inp) in ((token_LESS, "<", ln, cn), inp) end end \| #">" => let val (ch1, inp) = Inp.get_ch inp in if #ichar ch1 = Char.ord #"=" then ((token_GREATEREQUAL, ">=", ln, cn), inp) else let val inp = Inp.push_back_ch (ch1, inp) in ((token_GREATER, ">", ln, cn), inp) end end \| #"=" => let val (ch1, inp) = Inp.get_ch inp in if #ichar ch1 = Char.ord #"=" then ((token_EQUAL, "==", ln, cn), inp) else let val inp = Inp.push_back_ch (ch1, inp) in ((token_ASSIGN, "=", ln, cn), inp) end end \| #"!" => let val (ch1, inp) = Inp.get_ch inp in if #ichar ch1 = Char.ord #"=" then ((token_NOTEQUAL, "!=", ln, cn), inp) else let val inp = Inp.push_back_ch (ch1, inp) in ((token_NOT, "!", ln, cn), inp) end end \| #"&" => let val (ch1, inp) = Inp.get_ch inp in if #ichar ch1 = Char.ord #"&" then ((token_AND, "&&", ln, cn), inp) else raise Unexpected_character (#line_no ch, #column_no ch, Char.chr (#ichar ch)) end \| #"\|" => let val (ch1, inp) = Inp.get_ch inp in if #ichar ch1 = Char.ord #"\|" then ((token_OR, "\|\|", ln, cn), inp) else raise Unexpected_character (#line_no ch, #column_no ch, Char.chr (#ichar ch)) end \| #"\"" => let val inp = Inp.push_back_ch (ch, inp) in scan_string_literal inp end \| #"'" => let val inp = Inp.push_back_ch (ch, inp) in scan_character_literal inp end \| _ => if is_digit (#ichar ch) then let val inp = Inp.push_back_ch (ch, inp) in scan_integer_literal inp end else if is_ident_start (#ichar ch) then let val inp = Inp.push_back_ch (ch, inp) in scan_identifier_or_reserved_word inp end else raise Unexpected_character (#line_no ch, #column_no ch, Char.chr (#ichar ch)) end fun output_integer_rightjust (outf, num) = (if num < 10 then TextIO.output (outf, " ") else if num < 100 then TextIO.output (outf, " ") else if num < 1000 then TextIO.output (outf, " ") else if num < 10000 then TextIO.output (outf, " ") else (); TextIO.output (outf, Int.toString num)) fun print_token (outf, toktup) = let val (token, arg, line_no, column_no) = toktup val name = token_name token val (padding, str) = if token = token_IDENTIFIER then (" ", arg) else if token = token_INTEGER then (" ", arg) else if token = token_STRING then (" ", arg) else("", "") in output_integer_rightjust (outf, line_no); TextIO.output (outf, " "); output_integer_rightjust (outf, column_no); TextIO.output (outf, " "); TextIO.output (outf, name); TextIO.output (outf, padding); TextIO.output (outf, str); TextIO.output (outf, "\n") end fun scan_text (outf, inp) = let fun loop inp = let val (toktup, inp) = get_next_token inp in (print_token (outf, toktup); let val (token, _, _, _) = toktup in if token <> token_END_OF_INPUT then loop inp else () end) end in loop inp end (------------------------------------------------------------------) fun main () = let val args = CommandLine.arguments () val (inpf_filename, outf_filename) = case args of [] => ("-", "-") \| name :: [] => (name, "-") \| name1 :: name2 :: _ => (name1, name2) val inpf = if inpf_filename = "-" then TextIO.stdIn else TextIO.openIn inpf_filename handle (IO.Io _) => (TextIO.output (TextIO.stdErr, "Failure opening \""); TextIO.output (TextIO.stdErr, inpf_filename); TextIO.output (TextIO.stdErr, "\" for input\n"); OS.Process.exit OS.Process.failure) val outf = if outf_filename = "-" then TextIO.stdOut else TextIO.openOut outf_filename handle (IO.Io _) => (TextIO.output (TextIO.stdErr, "Failure opening \""); TextIO.output (TextIO.stdErr, outf_filename); TextIO.output (TextIO.stdErr, "\" for output\n"); OS.Process.exit OS.Process.failure) val inp = Inp.of_instream inpf in scan_text (outf, inp) end handle Unterminated_comment (line_no, column_no) => (TextIO.output (TextIO.stdErr, ": unterminated comment "); TextIO.output (TextIO.stdErr, "starting at "); TextIO.output (TextIO.stdErr, Int.toString line_no); TextIO.output (TextIO.stdErr, ":"); TextIO.output (TextIO.stdErr, Int.toString column_no); TextIO.output (TextIO.stdErr, "\n"); OS.Process.exit OS.Process.failure) \| Unterminated_character_literal (line_no, column_no) => (TextIO.output (TextIO.stdErr, ": unterminated character "); TextIO.output (TextIO.stdErr, "literal starting at "); TextIO.output (TextIO.stdErr, Int.toString line_no); TextIO.output (TextIO.stdErr, ":"); TextIO.output (TextIO.stdErr, Int.toString column_no); TextIO.output (TextIO.stdErr, "\n"); OS.Process.exit OS.Process.failure) \| Multicharacter_literal (line_no, column_no) => (TextIO.output (TextIO.stdErr, ": unsupported multicharacter"); TextIO.output (TextIO.stdErr, " literal starting at "); TextIO.output (TextIO.stdErr, Int.toString line_no); TextIO.output (TextIO.stdErr, ":"); TextIO.output (TextIO.stdErr, Int.toString column_no); TextIO.output (TextIO.stdErr, "\n"); OS.Process.exit OS.Process.failure) \| End_of_input_in_string_literal (line_no, column_no) => (TextIO.output (TextIO.stdErr, ": end of input in string"); TextIO.output (TextIO.stdErr, " literal starting at "); TextIO.output (TextIO.stdErr, Int.toString line_no); TextIO.output (TextIO.stdErr, ":"); TextIO.output (TextIO.stdErr, Int.toString column_no); TextIO.output (TextIO.stdErr, "\n"); OS.Process.exit OS.Process.failure) \| End_of_line_in_string_literal (line_no, column_no) => (TextIO.output (TextIO.stdErr, ": end of line in string"); TextIO.output (TextIO.stdErr, " literal starting at "); TextIO.output (TextIO.stdErr, Int.toString line_no); TextIO.output (TextIO.stdErr, ":"); TextIO.output (TextIO.stdErr, Int.toString column_no); TextIO.output (TextIO.stdErr, "\n"); OS.Process.exit OS.Process.failure) \| Unsupported_escape (line_no, column_no, c) => (TextIO.output (TextIO.stdErr, CommandLine.name ()); TextIO.output (TextIO.stdErr, ": unsupported escape \\"); TextIO.output (TextIO.stdErr, Char.toString c); TextIO.output (TextIO.stdErr, " at "); TextIO.output (TextIO.stdErr, Int.toString line_no); TextIO.output (TextIO.stdErr, ":"); TextIO.output (TextIO.stdErr, Int.toString column_no); TextIO.output (TextIO.stdErr, "\n"); OS.Process.exit OS.Process.failure) \| Invalid_integer_literal (line_no, column_no, str) => (TextIO.output (TextIO.stdErr, CommandLine.name ()); TextIO.output (TextIO.stdErr, ": invalid integer literal "); TextIO.output (TextIO.stdErr, str); TextIO.output (TextIO.stdErr, " at "); TextIO.output (TextIO.stdErr, Int.toString line_no); TextIO.output (TextIO.stdErr, ":"); TextIO.output (TextIO.stdErr, Int.toString column_no); TextIO.output (TextIO.stdErr, "\n"); OS.Process.exit OS.Process.failure) \| Unexpected_character (line_no, column_no, c) => (TextIO.output (TextIO.stdErr, CommandLine.name ()); TextIO.output (TextIO.stdErr, ": unexpected character '"); TextIO.output (TextIO.stdErr, Char.toString c); TextIO.output (TextIO.stdErr, "' at "); TextIO.output (TextIO.stdErr, Int.toString line_no); TextIO.output (TextIO.stdErr, ":"); TextIO.output (TextIO.stdErr, Int.toString column_no); TextIO.output (TextIO.stdErr, "\n"); OS.Process.exit OS.Process.failure); (------------------------------------------------------------------) ( For the Mlton compiler, include the following. For Poly/ML, comment it out. ) main (); (------------------------------------------------------------------) ( Instructions for GNU Emacs. ) ( local variables: ) ( mode: sml ) ( sml-indent-level: 2 ) ( sml-indent-args: 2 ) ( end: ) (------------------------------------------------------------------*)</syntaxhighlight> {{out}} For Mlton, compile with <pre>mlton -output lex lex.sml</pre> For Poly/ML, compile with <pre>polyc -o lex lex.sml</pre> Mlton is an optimizing whole-program compiler. It might take much longer to compile the source but produce much faster executables. Output for testcase3: <pre> 5 16 Keyword_print 5 40 Op_subtract 6 16 Keyword_putc 6 40 Op_less 7 16 Keyword_if 7 40 Op_greater 8 16 Keyword_else 8 40 Op_lessequal 9 16 Keyword_while 9 40 Op_greaterequal 10 16 LeftBrace 10 40 Op_equal 11 16 RightBrace 11 40 Op_notequal 12 16 LeftParen 12 40 Op_and 13 16 RightParen 13 40 Op_or 14 16 Op_subtract 14 40 Semicolon 15 16 Op_not 15 40 Comma 16 16 Op_multiply 16 40 Op_assign 17 16 Op_divide 17 40 Integer 42 18 16 Op_mod 18 40 String "String literal" 19 16 Op_add 19 40 Identifier variable_name 20 26 Integer 10 21 26 Integer 92 22 26 Integer 32 23 1 End_of_input</pre> =={{header\|Wren}}== Line 12,172 ⟶ 18,676: {{libheader\|Wren-fmt}} {{libheader\|Wren-ioutil}} <~~lang~~syntaxhighlight ~~ecmascript~~lang="wren">import "./dynamic" for Enum, Struct, Tuple import "./str" for Char import "./fmt" for Fmt import "./ioutil" for FileUtil import "os" for Process Line 12,521 ⟶ 19,025: lineCount = lines.count initLex.call() process.call()</~~lang~~syntaxhighlight> {{out}} Line 12,563 ⟶ 19,067: =={{header\|Zig}}== <~~lang~~syntaxhighlight lang="zig"> const std = @import("std"); Line 12,972 ⟶ 19,476: return result.items; } </syntaxhighlight> ~~</lang>~~