Compiler/lexical analyzer: Difference between revisions

m
m (→‎{{header|Wren}}: Minor tidy)
 
(21 intermediate revisions by 5 users not shown)
Line 158:
For example, the following two program fragments are equivalent, and should produce the same token stream except for the line and column positions:
 
* <syntaxhighlight lang="c">if ( p /* meaning n is prime */ ) {
print ( n , " " ) ;
count = count + 1 ; /* number of primes found so far */
}</syntaxhighlight>
* <syntaxhighlight lang="c">if(p){print(n," ");count=count+1;}</syntaxhighlight>
 
;Complete list of token names
Line 237:
| style="vertical-align:top" |
Test Case 1:
<syntaxhighlight lang="c">/*
Hello world
*/
print("Hello, World!\n");</syntaxhighlight>
 
| style="vertical-align:top" |
Line 255:
| style="vertical-align:top" |
Test Case 2:
<syntaxhighlight lang="c">/*
Show Ident and Integers
*/
phoenix_number = 142857;
print(phoenix_number, "\n");</syntaxhighlight>
 
| style="vertical-align:top" |
Line 280:
| style="vertical-align:top" |
Test Case 3:
<syntaxhighlight lang="c">/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
Line 301:
/* character literal */ '\n'
/* character literal */ '\\'
/* character literal */ ' '</syntaxhighlight>
 
| style="vertical-align:top" |
Line 344:
| style="vertical-align:top" |
Test Case 4:
<syntaxhighlight lang="c">/*** test printing, embedded \n and comments with lots of '*' ***/
print(42);
print("\nHello World\nGood Bye\nok\n");
print("Print a slash n - \\n.\n");</syntaxhighlight>
 
| style="vertical-align:top" |
Line 388:
 
=={{header|Ada}}==
<syntaxhighlight lang="ada">with Ada.Text_IO, Ada.Streams.Stream_IO, Ada.Strings.Unbounded, Ada.Command_Line,
Ada.Exceptions;
use Ada.Strings, Ada.Strings.Unbounded, Ada.Streams, Ada.Exceptions;
Line 648:
when error : others => IO.Put_Line("Error: " & Exception_Message(error));
end Main;
</syntaxhighlight>
{{out}} Test case 3:
<pre>
Line 686:
23 1 END_OF_INPUT
</pre>
 
=={{header|ALGOL 68}}==
This is a simple ''token in, line out'' program. It doesn't keep an internal representation of tokens or anything like that, since that's not needed at all.
 
As an addition, it emits a diagnostic if integer literals are too big.
<syntaxhighlight lang="algol68">BEGIN
# implement C-like getchar, where EOF and EOLn are "characters" (-1 and 10 resp.). #
INT eof = -1, eoln = 10;
BOOL eof flag := FALSE;
STRING buf := "";
INT col := 1;
INT line := 0;
on logical file end (stand in, (REF FILE f)BOOL: eof flag := TRUE);
PROC getchar = INT:
IF eof flag THEN eof
ELIF col = UPB buf THEN col +:= 1; eoln
ELIF col > UPB buf THEN IF line > 0 THEN read(newline) FI;
line +:= 1;
read(buf);
IF eof flag THEN col := 1; eof
ELSE col := 0; getchar
FI
ELSE col +:= 1; ABS buf[col]
FI;
PROC nextchar = INT: IF eof flag THEN eof ELIF col >= UPB buf THEN eoln ELSE ABS buf[col+1] FI;
 
PROC is blank = (INT ch) BOOL: ch = 0 OR ch = 9 OR ch = 10 OR ch = 13 OR ch = ABS " ";
PROC is digit = (INT ch) BOOL: ch >= ABS "0" AND ch <= ABS "9";
PROC is ident start = (INT ch) BOOL: ch >= ABS "A" AND ch <= ABS "Z" OR
ch >= ABS "a" AND ch <= ABS "z" OR
ch = ABS "_";
PROC is ident = (INT ch) BOOL: is ident start(ch) OR is digit(ch);
 
PROC ident or keyword = (INT start char) VOID:
BEGIN
STRING w := REPR start char;
INT start col = col;
WHILE is ident (next char) DO w +:= REPR getchar OD;
IF w = "if" THEN output2("Keyword_if", start col)
ELIF w = "else" THEN output2("Keyword_else", start col)
ELIF w = "while" THEN output2("Keyword_while", start col)
ELIF w = "print" THEN output2("Keyword_print", start col)
ELIF w = "putc" THEN output2("Keyword_putc", start col)
ELSE output2("Identifier " + w, start col)
FI
END;
PROC char = VOID:
BEGIN
INT start col = col;
INT ch := getchar;
IF ch = ABS "'" THEN error("Empty character constant")
ELIF ch = ABS "\" THEN ch := getchar;
IF ch = ABS "n" THEN ch := 10
ELIF ch = ABS "\" THEN SKIP
ELSE error("Unknown escape sequence. \" + REPR ch)
FI
FI;
IF nextchar /= ABS "'" THEN error("Multi-character constant.") FI;
getchar;
output2("Integer " + whole(ch, 0), start col)
END;
PROC string = VOID:
BEGIN
INT start col = col;
STRING s := """";
WHILE INT ch := getchar; ch /= ABS """"
DO
IF ch = eoln THEN error("End-of-line while scanning string literal. Closing string character not found before end-of-line.")
ELIF ch = eof THEN error("End-of-file while scanning string literal. Closing string character not found.")
ELIF ch = ABS "\" THEN s +:= REPR ch; ch := getchar;
IF ch /= ABS "\" AND ch /= ABS "n" THEN error("Unknown escape sequence. \" + REPR ch) FI;
s +:= REPR ch
ELSE s +:= REPR ch
FI
OD;
output2("String " + s + """", start col)
END;
PROC comment = VOID:
BEGIN
WHILE INT ch := getchar; NOT (ch = ABS "*" AND nextchar = ABS "/")
DO IF ch = eof THEN error("End-of-file in comment. Closing comment characters not found.") FI
OD;
getchar
END;
PROC number = (INT first digit) VOID:
BEGIN
INT start col = col;
INT n := first digit - ABS "0";
WHILE is digit (nextchar) DO
INT u := getchar - ABS "0";
IF LENG n * 10 + LENG u > max int THEN error("Integer too big") FI;
n := n * 10 + u
OD;
IF is ident start (nextchar) THEN error("Invalid number. Starts like a number, but ends in non-numeric characters.") FI;
output2("Integer " + whole(n, 0), start col)
END;
 
PROC output = (STRING s) VOID: output2(s, col);
PROC output2 = (STRING s, INT col) VOID: print((whole(line,-8), whole(col,-8), " ", s, newline));
 
PROC if follows = (CHAR second, STRING longer, shorter) VOID:
IF nextchar = ABS second
THEN output(longer); getchar
ELSE output(shorter)
FI;
PROC error = (STRING s)VOID: (put(stand error, ("At ", whole(line,0), ":", whole(col,0), " ", s, new line)); stop);
PROC unrecognized = (INT char) VOID: error("Unrecognized character " + REPR char);
PROC double char = (INT first, STRING op) VOID:
IF nextchar /= first THEN unrecognized(first)
ELSE output2(op, col-1); getchar
FI;
 
WHILE INT ch := getchar; ch /= eof
DO
IF is blank(ch) THEN SKIP
ELIF ch = ABS "(" THEN output("LeftParen")
ELIF ch = ABS ")" THEN output("RightParen")
ELIF ch = ABS "{" THEN output("LeftBrace")
ELIF ch = ABS "}" THEN output("RightBrace")
ELIF ch = ABS ";" THEN output("Semicolon")
ELIF ch = ABS "," THEN output("Comma")
ELIF ch = ABS "*" THEN output("Op_multiply")
ELIF ch = ABS "/" THEN IF next char = ABS "*" THEN comment
ELSE output("Op_divide")
FI
ELIF ch = ABS "%" THEN output("Op_mod")
ELIF ch = ABS "+" THEN output("Op_add")
ELIF ch = ABS "-" THEN output("Op_subtract")
ELIF ch = ABS "<" THEN if follows("=", "Op_lessequal", "Op_less")
ELIF ch = ABS ">" THEN if follows("=", "Op_greaterequal", "Op_greater")
ELIF ch = ABS "=" THEN if follows("=", "Op_equal", "Op_assign")
ELIF ch = ABS "!" THEN if follows("=", "Op_notequal", "Op_not")
ELIF ch = ABS "&" THEN double char(ch, "Op_and")
ELIF ch = ABS "|" THEN double char(ch, "Op_or")
ELIF is ident start (ch) THEN ident or keyword (ch)
ELIF ch = ABS """" THEN string
ELIF ch = ABS "'" THEN char
ELIF is digit(ch) THEN number(ch)
ELSE unrecognized(ch)
FI
OD;
output("End_Of_Input")
END</syntaxhighlight>
 
=={{header|ALGOL W}}==
<syntaxhighlight lang="algolw">begin
%lexical analyser %
% Algol W strings are limited to 256 characters in length so we limit source lines %
Line 981 ⟶ 1,124:
while nextToken not = tEnd_of_input do writeToken;
writeToken
end.</syntaxhighlight>
{{out}} Test case 3:
<pre>
Line 1,026 ⟶ 1,169:
(One point of note: the C "EOF" pseudo-character is detected in the following code by looking for a negative number. That EOF has to be negative and the other characters non-negative is implied by the ISO C standard.)
 
<syntaxhighlight lang="ats">(********************************************************************)
(* Usage: lex [INPUTFILE [OUTPUTFILE]]
If INPUTFILE or OUTPUTFILE is "-" or missing, then standard input
Line 1,898 ⟶ 2,041:
end
 
(********************************************************************)</syntaxhighlight>
 
{{out}}
Line 1,939 ⟶ 2,082:
=={{header|AWK}}==
Tested with gawk 4.1.1 and mawk 1.3.4.
<syntaxhighlight lang="awk">
BEGIN {
all_syms["tk_EOI" ] = "End_of_input"
Line 2,145 ⟶ 2,288:
}
}
</syntaxhighlight>
{{out|case=count}}
<b>
Line 2,182 ⟶ 2,325:
=={{header|C}}==
Tested with gcc 4.81 and later, compiles warning free with -Wpedantic -pedantic -Wall -Wextra
<syntaxhighlight lang="c">#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
Line 2,414 ⟶ 2,557:
run();
return 0;
}</syntaxhighlight>
 
{{out|case=test case 3}}
Line 2,458 ⟶ 2,601:
=={{header|C sharp|C#}}==
Requires C#6.0 because of the use of null coalescing operators.
<syntaxhighlight lang="csharp">
using System;
using System.IO;
Line 2,808 ⟶ 2,951:
}
}
</syntaxhighlight>
 
{{out|case=test case 3}}
Line 2,852 ⟶ 2,995:
=={{header|C++}}==
Tested with GCC 9.3.0 (g++ -std=c++17)
<syntaxhighlight lang="cpp">#include <charconv> // std::from_chars
#include <fstream> // file_to_string, string_to_file
#include <functional> // std::invoke
Line 3,237 ⟶ 3,380:
});
}
</syntaxhighlight>
 
{{out|case=test case 3}}
Line 3,282 ⟶ 3,425:
Using GnuCOBOL 2. By Steve Williams (with one change to get around a Rosetta Code code highlighter problem).
 
<syntaxhighlight lang="cobol"> >>SOURCE FORMAT IS FREE
*> this code is dedicated to the public domain
*> (GnuCOBOL) 2.3-dev.0
Line 3,688 ⟶ 3,831:
end-if
.
end program lexer.</syntaxhighlight>
 
{{out|case=test case 3}}
Line 3,730 ⟶ 3,873:
Lisp has a built-in reader and you can customize the reader by modifying its readtable. I'm also using the Gray stream, which is an almost standard feature of Common Lisp, for counting lines and columns.
 
<syntaxhighlight lang="lisp">(defpackage #:lexical-analyzer
(:use #:cl #:sb-gray)
(:export #:main))
Line 3,943 ⟶ 4,086:
 
(defun main ()
(lex *standard-input*))</syntaxhighlight>
{{out|case=test case 3}}
<pre> 5 16 KEYWORD-PRINT
Line 3,984 ⟶ 4,127:
{{trans|ATS}}
 
<syntaxhighlight lang="elixir">#!/bin/env elixir
# -*- elixir -*-
 
Line 4,452 ⟶ 4,595:
end ## module Lex
 
Lex.main(System.argv)</syntaxhighlight>
 
{{out}}
Line 4,495 ⟶ 4,638:
=={{header|Emacs Lisp}}==
{{works with|Emacs|GNU 27.2}}
{{trans|ATS}}
 
 
<syntaxhighlight lang="lisp">#!/usr/bin/emacs --script
;;
;; The Rosetta Code lexical analyzer in GNU Emacs Lisp.
Line 4,915 ⟶ 5,059:
(scan-text t))
 
(main)</syntaxhighlight>
 
 
Line 4,961 ⟶ 5,105:
 
 
<syntaxhighlight lang="erlang">#!/bin/env escript
%%%-------------------------------------------------------------------
 
Line 5,466 ⟶ 5,610:
%%% erlang-indent-level: 3
%%% end:
%%%-------------------------------------------------------------------</syntaxhighlight>
 
 
Line 5,508 ⟶ 5,652:
=={{header|Euphoria}}==
Tested with Euphoria 4.05.
<syntaxhighlight lang="euphoria">include std/io.e
include std/map.e
include std/types.e
Line 5,733 ⟶ 5,877:
end procedure
 
main(command_line())</syntaxhighlight>
 
{{out|case=test case 3}}
Line 5,777 ⟶ 5,921:
=={{header|Flex}}==
Tested with Flex 2.5.4.
<syntaxhighlight lang="c">%{
#include <stdio.h>
#include <stdlib.h>
Line 5,950 ⟶ 6,094:
} while (tok != tk_EOI);
return 0;
}</syntaxhighlight>
 
{{out|case=test case 3}}
Line 5,994 ⟶ 6,138:
=={{header|Forth}}==
Tested with Gforth 0.7.3.
<syntaxhighlight lang="forth">CREATE BUF 0 , \ single-character look-ahead buffer
CREATE COLUMN# 0 ,
CREATE LINE# 1 ,
Line 6,116 ⟶ 6,260:
THEN THEN ;
: TOKENIZE BEGIN CONSUME AGAIN ;
TOKENIZE</syntaxhighlight>
 
{{out}}
Line 6,130 ⟶ 6,274:
 
The author has placed this Fortran code in the public domain.
<syntaxhighlight lang="fortran">!!!
!!! An implementation of the Rosetta Code lexical analyzer task:
!!! https://rosettacode.org/wiki/Compiler/lexical_analyzer
Line 7,208 ⟶ 7,352:
end subroutine print_usage
end program lex</syntaxhighlight>
 
{{out}}
Line 7,249 ⟶ 7,393:
=={{header|FreeBASIC}}==
Tested with FreeBASIC 1.05
<syntaxhighlight lang="freebasic">enum Token_type
tk_EOI
tk_Mul
Line 7,535 ⟶ 7,679:
print : print "Hit any to end program"
sleep
system</syntaxhighlight>
{{out|case=test case 3}}
<b>
Line 7,576 ⟶ 7,720:
=={{header|Go}}==
{{trans|FreeBASIC}}
<syntaxhighlight lang="go">package main
 
import (
Line 7,953 ⟶ 8,097:
initLex()
process()
}</syntaxhighlight>
 
{{out}}
Line 7,996 ⟶ 8,140:
=={{header|Haskell}}==
Tested with GHC 8.0.2
<syntaxhighlight lang="haskell">import Control.Applicative hiding (many, some)
import Control.Monad.State.Lazy
import Control.Monad.Trans.Maybe (MaybeT, runMaybeT)
Line 8,300 ⟶ 8,444:
where (Just t, s') = runState (runMaybeT lexer) s
(txt, _, _) = s'
</syntaxhighlight>
 
{{out|case=test case 3}}
Line 8,352 ⟶ 8,496:
Global variables are avoided except for some constants that require initialization.
 
<syntaxhighlight lang="icon">#
# The Rosetta Code lexical analyzer in Icon with co-expressions. Based
# upon the ATS implementation.
Line 8,587 ⟶ 8,731:
toktup := scan_character_literal_without_checking_end(inp)
ch1 := @inp
if ch1[1] ~=== close_quote then {
repeat {
case ch1[1] of {
EOF : unterminated_character_literal(line_no, column_no)
close_quote : multicharacter_literal(line_no, column_no)
default : ch1 := @inp
}
}
}
Line 8,677 ⟶ 8,823:
if ch[1] == "*" then {
ch1 := @inp
(EOF === ch1[1]) & unterminated_comment(line_no, column_no)
}
}
Line 8,848 ⟶ 8,994:
procedure max(x, y)
return (if x < y then y else x)
end</syntaxhighlight>
 
 
Line 8,897 ⟶ 9,043:
Implementation:
 
<syntaxhighlight lang="j">symbols=:256#0
ch=: {{1 0+x[symbols=: x (a.i.y)} symbols}}
'T0 token' =: 0 ch '%+-!(){};,<>=!|&'
Line 9,017 ⟶ 9,163:
keep=. (tokens~:<,'''')*-.comments+.whitespace+.unknown*a:=values
keep&#each ((1+lines),.columns);<names,.values
}}</syntaxhighlight>
 
Test case 3:
 
<syntaxhighlight lang="j">
flex=: {{
'A B'=.y
Line 9,087 ⟶ 9,233:
21 28 Integer 92
22 27 Integer 32
23 1 End_of_input </syntaxhighlight>
 
Here, it seems expedient to retain a structured representation of the lexical result. As shown, it's straightforward to produce a "pure" textual result for a hypothetical alternative implementation of the syntax analyzer, but the structured representation will be easier to deal with.
 
=={{header|Java}}==
<syntaxhighlight lang="java">
// Translated from python source
 
Line 9,242 ⟶ 9,388:
if (text.equals("")) {
error(line, pos, String.format("identifer_or_integer unrecognized character: (%d) %c", (int)this.chr, this.chr));
}
if (Character.isDigit(text.charAt(0))) {
if (!is_number) {
error(line, pos, String.format("invalid number: %s", text));
}
return new Token(TokenType.Integer, text, line, pos);
Line 9,333 ⟶ 9,479:
}
}
</syntaxhighlight>
 
=={{header|JavaScript}}==
{{incorrect|Javascript|Please show output. Code is identical to [[Compiler/syntax_analyzer]] task}}
<syntaxhighlight lang="javascript">
/*
Token: type, value, line, pos
Line 9,550 ⟶ 9,696:
l.printTokens()
})
</syntaxhighlight>
 
=={{header|Julia}}==
<syntaxhighlight lang="julia">struct Tokenized
startline::Int
startcol::Int
Line 9,708 ⟶ 9,854:
println(lpad(tok.startline, 3), lpad(tok.startcol, 5), lpad(tok.name, 18), " ", tok.value != nothing ? tok.value : "")
end
</syntaxhighlight>{{output}}<pre>
Line Col Name Value
5 16 Keyword_print
Line 9,745 ⟶ 9,891:
23 1 End_of_input
</pre>
=={{header|kotlin}}==
{{trans|Java}}
<syntaxhighlight lang="kotlin">// Input: command line argument of file to process or console input. A two or
// three character console input of digits followed by a new line will be
// checked for an integer between zero and twenty-five to select a fixed test
// case to run. Any other console input will be parsed.
 
// Code based on the Java version found here:
// https://rosettacode.org/mw/index.php?title=Compiler/lexical_analyzer&action=edit&section=22
 
// Class to halt the parsing with an exception.
class ParsingFailed(message: String): Exception(message)
 
// Enumerate class of tokens supported by this scanner.
enum class TokenType {
Tk_End_of_input, Op_multiply, Op_divide, Op_mod, Op_add, Op_subtract,
Op_negate, Op_not, Op_less, Op_lessequal, Op_greater, Op_greaterequal,
Op_equal, Op_notequal, Op_assign, Op_and, Op_or, Kw_if,
Kw_else, Kw_while, Kw_print, Kw_putc, Sy_LeftParen, Sy_RightParen,
Sy_LeftBrace, Sy_RightBrace, Sy_Semicolon, Sy_Comma, Tk_Identifier,
Tk_Integer, Tk_String;
 
override fun toString() =
listOf("End_of_input", "Op_multiply", "Op_divide", "Op_mod", "Op_add",
"Op_subtract", "Op_negate", "Op_not", "Op_less", "Op_lessequal",
"Op_greater", "Op_greaterequal", "Op_equal", "Op_notequal",
"Op_assign", "Op_and", "Op_or", "Keyword_if", "Keyword_else",
"Keyword_while", "Keyword_print", "Keyword_putc", "LeftParen",
"RightParen", "LeftBrace", "RightBrace", "Semicolon", "Comma",
"Identifier", "Integer", "String")[this.ordinal]
} // TokenType
 
// Data class of tokens returned by the scanner.
data class Token(val token: TokenType, val value: String, val line: Int,
val pos: Int) {
 
// Overridden method to display the token.
override fun toString() =
"%5d %5d %-15s %s".format(line, pos, this.token,
when (this.token) {
TokenType.Tk_Integer, TokenType.Tk_Identifier ->
" %s".format(this.value)
TokenType.Tk_String ->
this.value.toList().joinToString("", " \"", "\"") {
when (it) {
'\t' ->
"\\t"
'\n' ->
"\\n"
'\u000b' ->
"\\v"
'\u000c' ->
"\\f"
'\r' ->
"\\r"
'"' ->
"\\\""
'\\' ->
"\\"
in ' '..'~' ->
"$it"
else ->
"\\u%04x".format(it.code) } }
else ->
"" } )
} // Token
 
// Function to display an error message and halt the scanner.
fun error(line: Int, pos: Int, msg: String): Nothing =
throw ParsingFailed("(%d, %d) %s\n".format(line, pos, msg))
 
// Class to process the source into tokens with properties of the
// source string, the line number, the column position, the index
// within the source string, the current character being processed,
// and map of the keyword strings to the corresponding token type.
class Lexer(private val s: String) {
private var line = 1
private var pos = 1
private var position = 0
private var chr =
if (s.isEmpty())
' '
else
s[0]
private val keywords = mapOf<String, TokenType>(
"if" to TokenType.Kw_if,
"else" to TokenType.Kw_else,
"print" to TokenType.Kw_print,
"putc" to TokenType.Kw_putc,
"while" to TokenType.Kw_while)
 
// Method to retrive the next character from the source. Use null after
// the end of our source.
private fun getNextChar() =
if (++this.position >= this.s.length) {
this.pos++
this.chr = '\u0000'
this.chr
} else {
this.pos++
this.chr = this.s[this.position]
when (this.chr) {
'\n' -> {
this.line++
this.pos = 0
} // line
'\t' ->
while (this.pos%8 != 1)
this.pos++
} // when
this.chr
} // if
 
// Method to return the division token, skip the comment, or handle the
// error.
private fun div_or_comment(line: Int, pos: Int): Token =
if (getNextChar() != '*')
Token(TokenType.Op_divide, "", line, pos);
else {
getNextChar() // Skip comment start
outer@
while (true)
when (this.chr) {
'\u0000' ->
error(line, pos, "Lexer: EOF in comment");
'*' ->
if (getNextChar() == '/') {
getNextChar() // Skip comment end
break@outer
} // if
else ->
getNextChar()
} // when
getToken()
} // if
 
// Method to verify a character literal. Return the token or handle the
// error.
private fun char_lit(line: Int, pos: Int): Token {
var c = getNextChar() // skip opening quote
when (c) {
'\'' ->
error(line, pos, "Lexer: Empty character constant");
'\\' ->
c = when (getNextChar()) {
'n' ->
10.toChar()
'\\' ->
'\\'
'\'' ->
'\''
else ->
error(line, pos, "Lexer: Unknown escape sequence '\\%c'".
format(this.chr)) }
} // when
if (getNextChar() != '\'')
error(line, pos, "Lexer: Multi-character constant")
getNextChar() // Skip closing quote
return Token(TokenType.Tk_Integer, c.code.toString(), line, pos)
} // char_lit
 
// Method to check next character to see whether it belongs to the token
// we might be in the middle of. Return the correct token or handle the
// error.
private fun follow(expect: Char, ifyes: TokenType, ifno: TokenType,
line: Int, pos: Int): Token =
when {
getNextChar() == expect -> {
getNextChar()
Token(ifyes, "", line, pos)
} // matches
ifno == TokenType.Tk_End_of_input ->
error(line, pos,
"Lexer: %c expected: (%d) '%c'".format(expect,
this.chr.code, this.chr))
else ->
Token(ifno, "", line, pos)
} // when
 
// Method to verify a character string. Return the token or handle the
// error.
private fun string_lit(start: Char, line: Int, pos: Int): Token {
var result = ""
while (getNextChar() != start)
when (this.chr) {
'\u0000' ->
error(line, pos, "Lexer: EOF while scanning string literal")
'\n' ->
error(line, pos, "Lexer: EOL while scanning string literal")
'\\' ->
when (getNextChar()) {
'\\' ->
result += '\\'
'n' ->
result += '\n'
'"' ->
result += '"'
else ->
error(line, pos, "Lexer: Escape sequence unknown '\\%c'".
format(this.chr))
} // when
else ->
result += this.chr
} // when
getNextChar() // Toss closing quote
return Token(TokenType.Tk_String, result, line, pos)
} // string_lit
 
// Method to retrive an identifier or integer. Return the keyword
// token, if the string matches one. Return the integer token,
// if the string is all digits. Return the identifer token, if the
// string is valid. Otherwise, handle the error.
private fun identifier_or_integer(line: Int, pos: Int): Token {
var is_number = true
var text = ""
while (this.chr in listOf('_')+('0'..'9')+('a'..'z')+('A'..'Z')) {
text += this.chr
is_number = is_number && this.chr in '0'..'9'
getNextChar()
} // while
if (text.isEmpty())
error(line, pos, "Lexer: Unrecognized character: (%d) %c".
format(this.chr.code, this.chr))
return when {
text[0] in '0'..'9' ->
if (!is_number)
error(line, pos, "Lexer: Invalid number: %s".
format(text))
else {
val max = Int.MAX_VALUE.toString()
if (text.length > max.length || (text.length == max.length &&
max < text))
error(line, pos,
"Lexer: Number exceeds maximum value %s".
format(text))
Token(TokenType.Tk_Integer, text, line, pos)
} // if
this.keywords.containsKey(text) ->
Token(this.keywords[text]!!, "", line, pos)
else ->
Token(TokenType.Tk_Identifier, text, line, pos) }
} // identifier_or_integer
 
// Method to skip whitespace both C's and Unicode ones and retrive the next
// token.
private fun getToken(): Token {
while (this.chr in listOf('\t', '\n', '\u000b', '\u000c', '\r', ' ') ||
this.chr.isWhitespace())
getNextChar()
val line = this.line
val pos = this.pos
return when (this.chr) {
'\u0000' ->
Token(TokenType.Tk_End_of_input, "", line, pos)
'/' ->
div_or_comment(line, pos)
'\'' ->
char_lit(line, pos)
'<' ->
follow('=', TokenType.Op_lessequal, TokenType.Op_less, line, pos)
'>' ->
follow('=', TokenType.Op_greaterequal, TokenType.Op_greater, line, pos)
'=' ->
follow('=', TokenType.Op_equal, TokenType.Op_assign, line, pos)
'!' ->
follow('=', TokenType.Op_notequal, TokenType.Op_not, line, pos)
'&' ->
follow('&', TokenType.Op_and, TokenType.Tk_End_of_input, line, pos)
'|' ->
follow('|', TokenType.Op_or, TokenType.Tk_End_of_input, line, pos)
'"' ->
string_lit(this.chr, line, pos)
'{' -> {
getNextChar()
Token(TokenType.Sy_LeftBrace, "", line, pos)
} // open brace
'}' -> {
getNextChar()
Token(TokenType.Sy_RightBrace, "", line, pos)
} // close brace
'(' -> {
getNextChar()
Token(TokenType.Sy_LeftParen, "", line, pos)
} // open paren
')' -> {
getNextChar()
Token(TokenType.Sy_RightParen, "", line, pos)
} // close paren
'+' -> {
getNextChar()
Token(TokenType.Op_add, "", line, pos)
} // plus
'-' -> {
getNextChar()
Token(TokenType.Op_subtract, "", line, pos)
} // dash
'*' -> {
getNextChar()
Token(TokenType.Op_multiply, "", line, pos)
} // asterisk
'%' -> {
getNextChar()
Token(TokenType.Op_mod, "", line, pos)
} // percent
';' -> {
getNextChar()
Token(TokenType.Sy_Semicolon, "", line, pos)
} // semicolon
',' -> {
getNextChar()
Token(TokenType.Sy_Comma, "", line, pos)
} // comma
else ->
identifier_or_integer(line, pos) }
} // getToken
 
// Method to parse and display tokens.
fun printTokens() {
do {
val t: Token = getToken()
println(t)
} while (t.token != TokenType.Tk_End_of_input)
} // printTokens
} // Lexer
 
 
// Function to test all good tests from the website and produce all of the
// error messages this program supports.
fun tests(number: Int) {
 
// Function to generate test case 0 source: Hello World/Text.
fun hello() {
Lexer(
"""/*
Hello world
*/
print("Hello, World!\n");
""").printTokens()
} // hello
 
// Function to generate test case 1 source: Phoenix Number.
fun phoenix() {
Lexer(
"""/*
Show Ident and Integers
*/
phoenix_number = 142857;
print(phoenix_number, "\n");""").printTokens()
} // phoenix
 
// Function to generate test case 2 source: All Symbols.
fun symbols() {
Lexer(
"""/*
All lexical tokens - not syntactically correct, but that will
have to wait until syntax analysis
*/
/* Print */ print /* Sub */ -
/* Putc */ putc /* Lss */ <
/* If */ if /* Gtr */ >
/* Else */ else /* Leq */ <=
/* While */ while /* Geq */ >=
/* Lbrace */ { /* Eq */ ==
/* Rbrace */ } /* Neq */ !=
/* Lparen */ ( /* And */ &&
/* Rparen */ ) /* Or */ ||
/* Uminus */ - /* Semi */ ;
/* Not */ ! /* Comma */ ,
/* Mul */ * /* Assign */ =
/* Div */ / /* Integer */ 42
/* Mod */ % /* String */ "String literal"
/* Add */ + /* Ident */ variable_name
/* character literal */ '\n'
/* character literal */ '\\'
/* character literal */ ' '""").printTokens()
} // symbols
 
// Function to generate test case 3 source: Test Case 4.
fun four() {
Lexer(
"""/*** test printing, embedded \n and comments with lots of '*' ***/
print(42);
print("\nHello World\nGood Bye\nok\n");
print("Print a slash n - \\n.\n");""").printTokens()
} // four
 
// Function to generate test case 4 source: Count.
fun count() {
Lexer(
"""count = 1;
while (count < 10) {
print("count is: ", count, "\n");
count = count + 1;
}""").printTokens()
} // count
 
// Function to generate test case 5 source: 100 Doors.
fun doors() {
Lexer(
"""/* 100 Doors */
i = 1;
while (i * i <= 100) {
print("door ", i * i, " is open\n");
i = i + 1;
}""").printTokens()
} // doors
 
// Function to generate test case 6 source: Negative Tests.
fun negative() {
Lexer(
"""a = (-1 * ((-1 * (5 * 15)) / 10));
print(a, "\n");
b = -a;
print(b, "\n");
print(-b, "\n");
print(-(1), "\n");""").printTokens()
} // negative
 
// Function to generate test case 7 source: Deep.
fun deep() {
Lexer(
"""print(---------------------------------+++5, "\n");
print(((((((((3 + 2) * ((((((2))))))))))))), "\n");
 
if (1) { if (1) { if (1) { if (1) { if (1) { print(15, "\n"); } } } } }""").printTokens()
} // deep
 
// Function to generate test case 8 source: Greatest Common Divisor.
fun gcd() {
Lexer(
"""/* Compute the gcd of 1071, 1029: 21 */
 
a = 1071;
b = 1029;
 
while (b != 0) {
new_a = b;
b = a % b;
a = new_a;
}
print(a);""").printTokens()
} // gcd
 
// Function to generate test case 9 source: Factorial.
fun factorial() {
Lexer(
"""/* 12 factorial is 479001600 */
 
n = 12;
result = 1;
i = 1;
while (i <= n) {
result = result * i;
i = i + 1;
}
print(result);""").printTokens()
} // factorial
 
// Function to generate test case 10 source: Fibonacci Sequence.
fun fibonacci() {
Lexer(
"""/* fibonacci of 44 is 701408733 */
 
n = 44;
i = 1;
a = 0;
b = 1;
while (i < n) {
w = a + b;
a = b;
b = w;
i = i + 1;
}
print(w, "\n");""").printTokens()
} // fibonacci
 
// Function to generate test case 11 source: FizzBuzz.
fun fizzbuzz() {
Lexer(
"""/* FizzBuzz */
i = 1;
while (i <= 100) {
if (!(i % 15))
print("FizzBuzz");
else if (!(i % 3))
print("Fizz");
else if (!(i % 5))
print("Buzz");
else
print(i);
 
print("\n");
i = i + 1;
}""").printTokens()
} // fizzbuzz
 
// Function to generate test case 12 source: 99 Bottles of Beer.
fun bottles() {
Lexer(
"""/* 99 bottles */
bottles = 99;
while (bottles > 0) {
print(bottles, " bottles of beer on the wall\n");
print(bottles, " bottles of beer\n");
print("Take one down, pass it around\n");
bottles = bottles - 1;
print(bottles, " bottles of beer on the wall\n\n");
}""").printTokens()
} // bottles
 
// Function to generate test case 13 source: Primes.
fun primes() {
Lexer(
"""/*
Simple prime number generator
*/
count = 1;
n = 1;
limit = 100;
while (n < limit) {
k=3;
p=1;
n=n+2;
while ((k*k<=n) && (p)) {
p=n/k*k!=n;
k=k+2;
}
if (p) {
print(n, " is prime\n");
count = count + 1;
}
}
print("Total primes found: ", count, "\n");""").printTokens()
} // primes
 
// Function to generate test case 14 source: Ascii Mandelbrot.
fun ascii() {
Lexer(
"""{
/*
This is an integer ascii Mandelbrot generator
*/
left_edge = -420;
right_edge = 300;
top_edge = 300;
bottom_edge = -300;
x_step = 7;
y_step = 15;
 
max_iter = 200;
 
y0 = top_edge;
while (y0 > bottom_edge) {
x0 = left_edge;
while (x0 < right_edge) {
y = 0;
x = 0;
the_char = ' ';
i = 0;
while (i < max_iter) {
x_x = (x * x) / 200;
y_y = (y * y) / 200;
if (x_x + y_y > 800 ) {
the_char = '0' + i;
if (i > 9) {
the_char = '@';
}
i = max_iter;
}
y = x * y / 100 + y0;
x = x_x - y_y + x0;
i = i + 1;
}
putc(the_char);
x0 = x0 + x_step;
}
putc('\n');
y0 = y0 - y_step;
}
}
""").printTokens()
} // ascii
 
when (number) {
0 ->
hello()
1 ->
phoenix()
2 ->
symbols()
3 ->
four()
4 ->
count()
5 ->
doors()
6 ->
negative()
7 ->
deep()
8 ->
gcd()
9 ->
factorial()
10 ->
fibonacci()
11 ->
fizzbuzz()
12 ->
bottles()
13 ->
primes()
14 ->
ascii()
15 -> // Lexer: Empty character constant
Lexer("''").printTokens()
16 -> // Lexer: Unknown escape sequence
Lexer("'\\x").printTokens()
17 -> // Lexer: Multi-character constant
Lexer("' ").printTokens()
18 -> // Lexer: EOF in comment
Lexer("/*").printTokens()
19 -> // Lexer: EOL in string
Lexer("\"\n").printTokens()
20 -> // Lexer: EOF in string
Lexer("\"").printTokens()
21 -> // Lexer: Escape sequence unknown
Lexer("\"\\x").printTokens()
22 -> // Lexer: Unrecognized character
Lexer("~").printTokens()
23 -> // Lexer: invalid number
Lexer("9a9").printTokens()
24 -> // Lexer: Number exceeds maximum value
Lexer("2147483648\n9223372036854775808").printTokens()
25 -> // Lexer: Operator expected
Lexer("|.").printTokens()
else ->
println("Invalid test number %d!".format(number))
} // when
} // tests
 
// Main function to check our source and read its data before parsing it.
// With no source specified, run the test of all symbols.
//
// A file name given on the command line is lexed directly; console
// input consisting only of a one- or two-digit number 0..25 selects a
// built-in test; any other console input is lexed as source text.
fun main(args: Array<String>) {
    try {
        val s =
            if (args.size > 0 && args[0].isNotEmpty())  // file on command line
                java.util.Scanner(java.io.File(args[0]))
            else                                        // use the console
                java.util.Scanner(System.`in`)
        // Accumulate the input in a StringBuilder: repeated String
        // concatenation would be O(n^2) in the total input size.
        // A newline separates lines; there is no trailing newline.
        val sb = StringBuilder()
        while (s.hasNext()) {
            sb.append(s.nextLine())
            if (s.hasNext())
                sb.append('\n')
        }
        val source = sb.toString()
        if (args.size > 0 && args[0].isNotEmpty())  // file on command line
            Lexer(source).printTokens()
        else {
            val digits = source.filter { it in '0'..'9' }
            when {
                source.isEmpty() ->  // nothing given
                    tests(2)
                source.length in 1..2 && digits.length == source.length &&
                    digits.toInt() in 0..25 ->  // a built-in test number
                    tests(digits.toInt())
                else ->
                    Lexer(source).printTokens()
            } // when
        } // if
    } catch (e: Throwable) {
        println(e.message)
        System.exit(1)
    } // try
} // main</syntaxhighlight>
{{out|case=test case 3: All Symbols}}
<b>
<pre>
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
22 29 End_of_input
</pre>
</b>
 
=={{header|Lua}}==
Line 9,753 ⟶ 10,614:
 
The first module is simply a table defining the names of tokens which don't have an associated value.
<langsyntaxhighlight Lualang="lua">-- module token_name (in a file "token_name.lua")
local token_name = {
['*'] = 'Op_multiply',
Line 9,782 ⟶ 10,643:
['putc'] = 'Keyword_putc',
}
return token_name</langsyntaxhighlight>
 
This module exports a function <i>find_token</i>, which attempts to find the next valid token from a specified position in a source line.
<langsyntaxhighlight Lualang="lua">-- module lpeg_token_finder
local M = {} -- only items added to M will be public (via 'return M' at end)
local table, concat = table, table.concat
Line 9,868 ⟶ 10,729:
end
return M</langsyntaxhighlight>
 
The <i>lexer</i> module uses <i>finder.find_token</i> to produce an iterator over the tokens in a source.
<langsyntaxhighlight Lualang="lua">-- module lexer
local M = {} -- only items added to M will publicly available (via 'return M' at end)
local string, io, coroutine, yield = string, io, coroutine, coroutine.yield
Line 9,950 ⟶ 10,811:
-- M._INTERNALS = _ENV
return M
</syntaxhighlight>
</lang>
 
This script uses <i>lexer.tokenize_text</i> to show the token sequence produced from a source text.
 
<langsyntaxhighlight Lualang="lua">lexer = require 'lexer'
format, gsub = string.format, string.gsub
 
Line 9,992 ⟶ 10,853:
-- etc.
end
</syntaxhighlight>
</lang>
 
===Using only standard libraries===
This version replaces the <i>lpeg_token_finder</i> module of the LPeg version with this <i>basic_token_finder</i> module, altering the <i>require</i> expression near the top of the <i>lexer</i> module accordingly. Tested with Lua 5.3.5. (Note that <i>select</i> is a standard function as of Lua 5.2.)
 
<langsyntaxhighlight lang="lua">-- module basic_token_finder
local M = {} -- only items added to M will be public (via 'return M' at end)
local table, string = table, string
Line 10,127 ⟶ 10,988:
 
-- M._ENV = _ENV
return M</langsyntaxhighlight>
 
=={{header|M2000 Interpreter}}==
<syntaxhighlight lang="m2000 interpreter">
<lang M2000 Interpreter>
Module lexical_analyzer {
a$={/*
Line 10,386 ⟶ 11,247:
}
lexical_analyzer
</syntaxhighlight>
</lang>
 
{{out}}
Line 10,431 ⟶ 11,292:
 
 
<langsyntaxhighlight Mercurylang="mercury">% -*- mercury -*-
%
% Compile with maybe something like:
Line 11,161 ⟶ 12,022:
 
:- func eof = int is det.
eof = -1.</langsyntaxhighlight>
 
{{out}}
Line 11,210 ⟶ 12,071:
Tested with Nim v0.19.4. Both examples are tested against all programs in [[Compiler/Sample programs]].
===Using string with regular expressions===
<langsyntaxhighlight lang="nim">
import re, strformat, strutils
 
Line 11,402 ⟶ 12,263:
 
echo input.tokenize.output
</syntaxhighlight>
</lang>
===Using stream with lexer library===
<langsyntaxhighlight lang="nim">
import lexbase, streams
from strutils import Whitespace
Line 11,715 ⟶ 12,576:
echo &"({l.lineNumber},{l.getColNumber l.bufpos + 1}) {l.error}"
main()
</syntaxhighlight>
</lang>
 
===Using nothing but system and strutils===
<langsyntaxhighlight lang="nim">import strutils
 
type
Line 11,938 ⟶ 12,799:
stdout.write('\n')
if token.kind == tokEnd:
break</langsyntaxhighlight>
 
=={{header|ObjectIcon}}==
Line 11,948 ⟶ 12,809:
 
 
<langsyntaxhighlight ObjectIconlang="objecticon"># -*- ObjectIcon -*-
#
# The Rosetta Code lexical analyzer in Object Icon. Based upon the ATS
Line 12,186 ⟶ 13,047:
toktup := scan_character_literal_without_checking_end(inp)
ch1 := @inp
while EOF ~=== ch1[1] &if ch1[1] ~=== close_quote dothen {
case ch1[1] ofrepeat {
case ch1[1] of {
EOF : unterminated_character_literal(line_no, column_no)
close_quote EOF : multicharacter_literalunterminated_character_literal(line_no, column_no)
close_quote : multicharacter_literal(line_no, column_no)
default : ch1 := @inp
default : ch1 := @inp
}
}
}
Line 12,443 ⟶ 13,306:
write!([FileStream.stderr] ||| args)
exit(1)
end</langsyntaxhighlight>
 
 
Line 12,491 ⟶ 13,354:
(Much of the extra complication in the ATS comes from arrays being a linear type (whose "views" need tending), and from values of linear type having to be local to any function using them. This limitation could have been worked around, and arrays more similar to OCaml arrays could have been used, but at a cost in safety and efficiency.)
 
<langsyntaxhighlight OCamllang="ocaml">(*------------------------------------------------------------------*)
(* The Rosetta Code lexical analyzer, in OCaml. Based on the ATS. *)
 
Line 13,018 ⟶ 13,881:
main ()
 
(*------------------------------------------------------------------*)</langsyntaxhighlight>
 
{{out}}
Line 13,061 ⟶ 13,924:
Note: we do not print the line and token source code position for the simplicity.
 
<langsyntaxhighlight lang="scheme">
(import (owl parse))
 
Line 13,185 ⟶ 14,048:
(if (null? (cdr stream))
(print 'End_of_input))))
</syntaxhighlight>
</lang>
 
==== Testing ====
 
Testing function:
<langsyntaxhighlight lang="scheme">
(define (translate source)
(let ((stream (try-parse token-parser (str-iter source) #t)))
Line 13,196 ⟶ 14,059:
(if (null? (force (cdr stream)))
(print 'End_of_input))))
</syntaxhighlight>
</lang>
 
====== Testcase 1 ======
 
<langsyntaxhighlight lang="scheme">
(translate "
/*
Line 13,206 ⟶ 14,069:
*/
print(\"Hello, World!\\\\n\");
")</langsyntaxhighlight>
{{Out}}
<pre>
Line 13,219 ⟶ 14,082:
====== Testcase 2 ======
 
<langsyntaxhighlight lang="scheme">
(translate "
/*
Line 13,226 ⟶ 14,089:
phoenix_number = 142857;
print(phoenix_number, \"\\\\n\");
")</langsyntaxhighlight>
{{Out}}
<pre>
Line 13,245 ⟶ 14,108:
====== Testcase 3 ======
 
<langsyntaxhighlight lang="scheme">
(translate "
/*
Line 13,269 ⟶ 14,132:
/* character literal */ '\\\\'
/* character literal */ ' '
")</langsyntaxhighlight>
{{Out}}
<pre>
Line 13,310 ⟶ 14,173:
====== Testcase 4 ======
 
<langsyntaxhighlight lang="scheme">
(translate "
/*** test printing, embedded \\\\n and comments with lots of '*' ***/
Line 13,317 ⟶ 14,180:
print(\"Print a slash n - \\\\\\\\n.\\\\n\");
")
</syntaxhighlight>
</lang>
{{Out}}
<pre>
Line 13,340 ⟶ 14,203:
=={{header|Perl}}==
 
<langsyntaxhighlight lang="perl">#!/usr/bin/env perl
 
use strict;
Line 13,479 ⟶ 14,342:
($line, $col)
}
}</langsyntaxhighlight>
 
{{out|case=test case 3}}
Line 13,522 ⟶ 14,385:
===Alternate Perl Solution===
Tested on perl v5.26.1
<langsyntaxhighlight Perllang="perl">#!/usr/bin/perl
 
use strict; # lex.pl - source to tokens
Line 13,558 ⟶ 14,421:
1 + $` =~ tr/\n//, 1 + length $` =~ s/.*\n//sr, $^R;
}
printf "%5d %7d %s\n", 1 + tr/\n//, 1, 'End_of_input';</langsyntaxhighlight>
 
=={{header|Phix}}==
Line 13,565 ⟶ 14,428:
form. If required, demo\rosetta\Compiler\extra.e (below) contains some code that achieves the latter.
Code to print the human readable forms is likewise kept separate from any re-usable parts.
<!--<langsyntaxhighlight Phixlang="phix">(phixonline)-->
<span style="color: #000080;font-style:italic;">--
-- demo\rosetta\Compiler\core.e
Line 13,725 ⟶ 14,588:
<span style="color: #008080;">return</span> <span style="color: #000000;">s</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<!--</langsyntaxhighlight>-->
For running under pwa/p2js, we also have a "fake file/io" component:
<!--<langsyntaxhighlight Phixlang="phix">(phixonline)-->
<span style="color: #000080;font-style:italic;">--
-- demo\rosetta\Compiler\js_io.e
Line 13,736 ⟶ 14,599:
--</span>
<span style="color: #008080;">with</span> <span style="color: #008080;">javascript_semantics</span>
<span style="color: #008080004080;">constantsequence</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">known_files</span><span style="color: #0000FF;">,</span><span style="color: #000000;">kfc</span><span style="color: #0000FF;">}</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">columnize</span><span style="color: #0000FF;">({</span>
<span style="color: #0000FF;">{</span><span style="color: #008000;">"test3.c"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"""
/*
Line 13,801 ⟶ 14,664:
}
print(a);
"""</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)},</span>
<span style="color: #0000FF;">{</span><span style="color: #008000;">"Header.h"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"""
#define area(h, w) h * w
"""</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)},</span>
<span style="color: #0000FF;">{</span><span style="color: #008000;">"Source.t"</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">split</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"""
#include "Header.h"
#define width 5
#define height 6
area = #area(height, width)#;
"""</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"\n"</span><span style="color: #0000FF;">)}})</span>
<span style="color: #004080;">integersequence</span> <span style="color: #000000;">fnlinenos</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">repeat</span><span style="color: #0000FF;">(-</span><span style="color: #000000;">1</span><span style="color: #0000FF;">,</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">linenoknown_files</span><span style="color: #0000FF;">))</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">function</span> <span style="color: #000000;">js_open</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">filename</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">fn</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">find</span><span style="color: #0000FF;">(</span><span style="color: #000000;">filename</span><span style="color: #0000FF;">,</span><span style="color: #000000;">known_files</span><span style="color: #0000FF;">)</span>
<span style="color: #7060A8;">assert</span><span style="color: #0000FF;">(</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">!=</span><span style="color: #000000;">0</span><span style="color: #0000FF;">)</span>
<span style="color: #000000;">linenolinenos</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">]</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">fn</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #008080;">global</span> <span style="color: #008080;">function</span> <span style="color: #000000;">js_gets</span><span style="color: #0000FF;">(</span><span style="color: #004080;">integer</span> <span style="color: #000000;">fn</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">integer</span> <span style="color: #000000;">lineno</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">linenos</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">]+</span><span style="color: #000000;">1</span>
<span style="color: #008080;">if</span> <span style="color: #000000;">lineno</span><span style="color: #0000FF;">><=</span><span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">kfc</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">])</span> <span style="color: #008080;">then</span> <span style="color: #008080;">return</span> <span style="color: #000000;">EOF</span> <span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">kfclinenos</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">][</span> <span style="color: #0000000000FF;">lineno=</span> <span style="color: #0000FF000000;">]lineno</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">kfc</span><span style="color: #0000FF;">[</span><span style="color: #000000;">fn</span><span style="color: #0000FF;">][</span><span style="color: #000000;">lineno</span><span style="color: #0000FF;">]</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
<span style="color: #008080;">return</span> <span style="color: #000000;">EOF</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<!--</langsyntaxhighlight>-->
The main lexer is also written to be reusable by later stages.
<!--<langsyntaxhighlight Phixlang="phix">(phixonline)-->
<span style="color: #000080;font-style:italic;">--
-- demo\\rosetta\\Compiler\\lex.e
Line 13,847 ⟶ 14,722:
<span style="color: #000000;">line</span> <span style="color: #0000FF;">+=</span> <span style="color: #000000;">1</span>
<span style="color: #000000;">col</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span>
<span style="color: #000000;">oneline</span> <span style="color: #0000FF;">=</span> <span style="color: #008080;">iff</span><span style="color: #0000FF;">(</span><span style="color: #7060A8;">platform</span><span style="color: #0000FF;">()=</span><span style="color: #004600;">JS</span><span style="color: #0000FF;">?</span><span style="color: #000000;">js_gets</span><span style="color: #0000FF;">(</span><span style="color: #000000;">input_file</span><span style="color: #0000FF;">)</span>
<span style="color: #0000FF;">:</span><span style="color: #7060A8;">gets</span><span style="color: #0000FF;">(</span><span style="color: #000000;">input_file</span><span style="color: #0000FF;">))</span>
<span style="color: #008080;">else</span>
Line 13,857 ⟶ 14,732:
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<span style="color: #000080;font-style:italic;">-- for pwa/p2js (JavaScript *really* dislikes tabs in strings):
--constant whitespace = " \t\r\n\x0B\xA0"</span>
<span style="color: #008080;">constant</span> <span style="color: #000000;">whitespace</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{</span><span style="color: #008000;">' '</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'\t'</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'\r'</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'\n'</span><span style="color: #0000FF;">,</span><span style="color: #000000;">#0B</span><span style="color: #0000FF;">,</span><span style="color: #000000;">#A0</span><span style="color: #0000FF;">}</span>
<span style="color: #000080;font-style:italic;">-- (0x0B is Vertical Tab, 0xA0 is Non-breaking space)</span>
Line 13,937 ⟶ 14,814:
<span style="color: #008080;">function</span> <span style="color: #000000;">get_op</span><span style="color: #0000FF;">()</span>
<span style="color: #000080;font-style:italic;">-- sequence operator = {ch}</span>
<span style="color: #004080;">string</span> <span style="color: #000000;">operator</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">""</span><span style="color: #0000FF;">&</span><span style="color: #000000;">ch</span>
<span style="color: #000000;">ch</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">next_ch</span><span style="color: #0000FF;">()</span>
Line 14,003 ⟶ 14,881:
<span style="color: #008080;">return</span> <span style="color: #000000;">toks</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<!--</langsyntaxhighlight>-->
Optional: if you need human-readable output/input at each (later) stage, so you can use pipes
<!--<langsyntaxhighlight Phixlang="phix">-->
<span style="color: #000080;font-style:italic;">--
-- demo\rosetta\Compiler\extra.e
Line 14,058 ⟶ 14,936:
<span style="color: #008080;">return</span> <span style="color: #0000FF;">{</span><span style="color: #000000;">n_type</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">left</span><span style="color: #0000FF;">,</span> <span style="color: #000000;">right</span><span style="color: #0000FF;">}</span>
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
<!--</langsyntaxhighlight>-->
Finally, a simple test driver for the specific task:
<!--<langsyntaxhighlight Phixlang="phix">(phixonline)-->
<span style="color: #000080;font-style:italic;">--
-- demo\rosetta\Compiler\lex.exw
Line 14,088 ⟶ 14,966:
<span style="color: #000080;font-style:italic;">--main(command_line())</span>
<span style="color: #000000;">main</span><span style="color: #0000FF;">({</span><span style="color: #000000;">0</span><span style="color: #0000FF;">,</span><span style="color: #000000;">0</span><span style="color: #0000FF;">,</span><span style="color: #008000;">"test4.c"</span><span style="color: #0000FF;">})</span>
<!--</langsyntaxhighlight>-->
{{out}}
<pre>
Line 14,111 ⟶ 14,989:
=={{header|Prolog}}==
 
<langsyntaxhighlight lang="prolog">/*
Test harness for the analyzer, not needed if we are actually using the output.
*/
Line 14,271 ⟶ 15,149:
 
% anything else is an error
tok(_,_,L,P) --> { format(atom(Error), 'Invalid token at line ~d,~d', [L,P]), throw(Error) }.</langsyntaxhighlight>
{{out}}
<pre>
Line 14,312 ⟶ 15,190:
=={{header|Python}}==
Tested with Python 2.7 and 3.x
<langsyntaxhighlight Pythonlang="python">from __future__ import print_function
import sys
 
Line 14,393 ⟶ 15,271:
#*** "string"
def string_lit(start, err_line, err_col):
global the_ch
text = ""
 
Line 14,400 ⟶ 15,279:
if the_ch == '\n':
error(err_line, err_col, "EOL while scanning string literal")
if the_ch == '\\':
next_ch()
if the_ch != 'n':
error(err_line, err_col, "escape sequence unknown \\%c" % the_ch)
the_ch = '\n'
text += the_ch
 
Line 14,487 ⟶ 15,371:
 
if tok == tk_EOI:
break</langsyntaxhighlight>
 
{{out|case=test case 3}}
Line 14,531 ⟶ 15,415:
=={{header|QB64}}==
Tested with QB64 1.5
<langsyntaxhighlight lang="vb">dim shared source as string, the_ch as string, tok as string, toktyp as string
dim shared line_n as integer, col_n as integer, text_p as integer, err_line as integer, err_col as integer, errors as integer
 
Line 14,771 ⟶ 15,655:
end
end sub
</syntaxhighlight>
</lang>
{{out|case=test case 3}}
<b>
Line 14,811 ⟶ 15,695:
 
=={{header|Racket}}==
<langsyntaxhighlight lang="racket">
#lang racket
(require parser-tools/lex)
Line 14,967 ⟶ 15,851:
"TEST 5"
(display-tokens (string->tokens test5))
</syntaxhighlight>
</lang>
 
=={{header|Raku}}==
Line 14,977 ⟶ 15,861:
{{works with|Rakudo|2016.08}}
 
<syntaxhighlight lang="raku" perl6line>grammar tiny_C {
rule TOP { ^ <.whitespace>? <tokens> + % <.whitespace> <.whitespace> <eoi> }
 
Line 15,070 ⟶ 15,954:
 
my $tokenizer = tiny_C.parse(@*ARGS[0].IO.slurp);
parse_it( $tokenizer );</langsyntaxhighlight>
 
{{out|case=test case 3}}
Line 15,109 ⟶ 15,993:
23 1 End_of_input
</pre>
 
=={{header|RATFOR}}==
{{works with|ratfor77|[https://sourceforge.net/p/chemoelectric/ratfor77/ public domain 1.0]}}
{{works with|gfortran|11.2.1}}
{{works with|f2c|20100827}}
 
 
<syntaxhighlight lang="ratfor">######################################################################
#
# The Rosetta Code scanner in Ratfor 77.
#
#
# How to deal with FORTRAN 77 input is a problem. I use formatted
# input, treating each line as an array of type CHARACTER--regrettably
# of no more than some predetermined, finite length. It is a very
# simple method and presents no significant difficulties, aside from
# the restriction on line length of the input.
#
#
# On a POSIX platform, the program can be compiled with f2c and run
# somewhat as follows:
#
# ratfor77 lex-in-ratfor.r > lex-in-ratfor.f
# f2c -C -Nc40 lex-in-ratfor.f
# cc -O lex-in-ratfor.c -lf2c
# ./a.out < compiler-tests/primes.t
#
# With gfortran, a little differently:
#
# ratfor77 lex-in-ratfor.r > lex-in-ratfor.f
# gfortran -O2 -fcheck=all -std=legacy lex-in-ratfor.f
# ./a.out < compiler-tests/primes.t
#
#
# I/O is strictly from default input and to default output, which, on
# POSIX systems, usually correspond respectively to standard input and
# standard output. (I did not wish to have to deal with unit numbers;
# these are now standardized in ISO_FORTRAN_ENV, but that is not
# available in FORTRAN 77.)
#
#---------------------------------------------------------------------
 
# Some parameters you may with to modify.
 
define(LINESZ, 256) # Size of an input line.
define(OUTLSZ, 512) # Size of an output line.
define(PSHBSZ, 10) # Size of the character pushback buffer.
define(STRNSZ, 4096) # Size of the string pool.
 
#---------------------------------------------------------------------
 
define(EOF, -1)
define(NEWLIN, 10) # Unix newline (the LF control character).
define(BACKSL, 92) # ASCII backslash.
 
define(ILINNO, 1) # Line number's index.
define(ICOLNO, 2) # Column number's index.
 
define(CHRSZ, 3) # See ILINNO and ICOLNO above.
define(ICHRCD, 3) # Character code's index.
 
define(TOKSZ, 5) # See ILINNO and ICOLNO above.
define(ITOKNO, 3) # Token number's index.
define(IARGIX, 4) # Index of the string pool index.
define(IARGLN, 5) # Index of the string length.
 
define(TKELSE, 0)
define(TKIF, 1)
define(TKPRNT, 2)
define(TKPUTC, 3)
define(TKWHIL, 4)
define(TKMUL, 5)
define(TKDIV, 6)
define(TKMOD, 7)
define(TKADD, 8)
define(TKSUB, 9)
define(TKNEG, 10)
define(TKLT, 11)
define(TKLE, 12)
define(TKGT, 13)
define(TKGE, 14)
define(TKEQ, 15)
define(TKNE, 16)
define(TKNOT, 17)
define(TKASGN, 18)
define(TKAND, 19)
define(TKOR, 20)
define(TKLPAR, 21)
define(TKRPAR, 22)
define(TKLBRC, 23)
define(TKRBRC, 24)
define(TKSEMI, 25)
define(TKCMMA, 26)
define(TKID, 27)
define(TKINT, 28)
define(TKSTR, 29)
define(TKEOI, 30)
 
define(LOC10, 1) # Location of "10" in the string pool.
define(LOC92, 3) # Location of "92" in the string pool.
 
#---------------------------------------------------------------------
 
subroutine addstr (strngs, istrng, src, i0, n0, i, n)

# Append the substring src(i0 .. i0+n0-1) to the string pool, and
# return (via i and n) where it landed in the pool.  Aborts the
# program if the pool has no room left.

implicit none

character strngs(STRNSZ)        # String pool.
integer istrng                  # String pool's next free slot.
character src(*)                # Source string.
integer i0, n0                  # Index and length in source string.
integer i, n                    # Index and length in string pool.

integer k

# Refuse to copy past the end of the fixed-size pool.
if (STRNSZ < istrng + (n0 - 1))
{
   write (*, '(''string pool exhausted'')')
   stop
}
i = istrng
n = n0
for (k = 0; k < n0; k = k + 1)
   strngs(istrng + k) = src(i0 + k)
istrng = istrng + n0
end
 
subroutine cpystr (strngs, i, n, dst, i0)

# Copy the pool substring strngs(i .. i+n-1) into the output string
# dst, starting at position i0.  Aborts the program if the copy
# would fall outside the bounds of dst.

implicit none

character strngs(STRNSZ)        # String pool.
integer i, n                    # Index and length in string pool.
character dst(OUTLSZ)           # Destination string.
integer i0                      # Index within destination string.

integer k

# Bounds check both ends of the destination field.
if (i0 < 1 || OUTLSZ < i0 + (n - 1))
{
   write (*, '(''string boundary exceeded'')')
   stop
}
for (k = 0; k < n; k = k + 1)
   dst(i0 + k) = strngs(i + k)
end
 
#---------------------------------------------------------------------
 
subroutine getchr (line, linno, colno, pushbk, npshbk, chr)

# Get a character, with its line number and column number.
#
# Characters come from three places, in order of preference:
#   1. the pushback stack (most recently pushed character first),
#   2. the current line buffer, advancing colno,
#   3. otherwise a newline is returned for the end of the current
#      line, and the next input line is fetched eagerly.

implicit none

character line(LINESZ)          # Input buffer.
integer linno, colno            # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ)   # Pushback buffer.
integer npshbk                  # Number of characters pushed back.
integer chr(CHRSZ)              # Result: char code, line no., column no.

# End of file is indicated (as in C) by a negative "char code"
# called "EOF".

character*20 fmt
integer stat
integer chr1(CHRSZ)

if (0 < npshbk)
{
   # Pop the most recently pushed-back character (LIFO order).
   chr(ICHRCD) = pushbk(ICHRCD, npshbk)
   chr(ILINNO) = pushbk(ILINNO, npshbk)
   chr(ICOLNO) = pushbk(ICOLNO, npshbk)
   npshbk = npshbk - 1
}
else if (colno <= LINESZ)
{
   # Take the next character from the line buffer.
   chr(ICHRCD) = ichar (line(colno))
   chr(ILINNO) = linno
   chr(ICOLNO) = colno
   colno = colno + 1
}
else
{
   # Return a newline character.
   chr(ICHRCD) = NEWLIN
   chr(ILINNO) = linno
   chr(ICOLNO) = colno

   # Fetch a new line.
   linno = linno + 1
   colno = 1

   # Read a line of text as an array of characters.
   # The format "(<LINESZ>A1)" is built at run time since LINESZ
   # is a preprocessor constant.
   write (fmt, '(''('', I10, ''A1)'')') LINESZ
   read (*, fmt, iostat = stat) line

   if (stat != 0)
   {
      # If end of file has been reached, push an EOF.  It will be
      # returned (repeatedly) by subsequent calls.
      chr1(ICHRCD) = EOF
      chr1(ILINNO) = linno
      chr1(ICOLNO) = colno
      call pshchr (pushbk, npshbk, chr1)
   }
}
end
 
subroutine pshchr (pushbk, npshbk, chr)

# Push a character (with its source position) onto the pushback
# stack, so that getchr will return it again.  Aborts the program
# if the fixed-size stack is already full.

implicit none

integer pushbk(CHRSZ, PSHBSZ)   # Pushback buffer.
integer npshbk                  # Number of characters pushed back.
integer chr(CHRSZ)

if (PSHBSZ <= npshbk)
{
   write (*, '(''pushback buffer overfull'')')
   stop
}
npshbk = npshbk + 1
pushbk(ILINNO, npshbk) = chr(ILINNO)
pushbk(ICOLNO, npshbk) = chr(ICOLNO)
pushbk(ICHRCD, npshbk) = chr(ICHRCD)
end
 
subroutine getpos (line, linno, colno, pushbk, npshbk, ln, cn)

# Peek at the position of the next character: read it, note its
# line and column numbers, then push it back unconsumed.

implicit none

character line(LINESZ)          # Input buffer.
integer linno, colno            # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ)   # Pushback buffer.
integer npshbk                  # Number of characters pushed back.
integer ln, cn                  # The line and column nos. returned.

integer nxtchr(CHRSZ)

call getchr (line, linno, colno, pushbk, npshbk, nxtchr)
ln = nxtchr(ILINNO)
cn = nxtchr(ICOLNO)
call pshchr (pushbk, npshbk, nxtchr)
end
 
#---------------------------------------------------------------------
 
function isspc (c)

# Does the character code c denote whitespace?

implicit none

integer c
logical isspc

#
# Correct for ASCII: codes 9 through 13 are the control characters
# HT, LF, VT, FF, CR, commonly regarded as spaces, and 32 is SPACE
# itself.  (In Unicode these are all spaces too, but Unicode has
# other space code points besides.)
#
isspc = ((9 <= c && c <= 13) || c == 32)
end
 
function isdgt (c)

# Does the character code c denote a decimal digit?

implicit none

integer c
logical isdgt

isdgt = (ichar ('0') <= c) && (c <= ichar ('9'))
end
 
function isalph (c)

# Does the character code c denote a letter?

implicit none

integer c
logical isalph

#
# Correct for ASCII and Unicode, but not for EBCDIC, in which the
# letters are not contiguous.
#
isalph = (ichar ('A') <= c && c <= ichar ('Z')) _
      || (ichar ('a') <= c && c <= ichar ('z'))
end
 
function isid0 (c)

# May the character code c begin an identifier?
# Identifiers start with a letter or an underscore.

implicit none

integer c
logical isid0

logical isalph

isid0 = (c == ichar ('_')) || isalph (c)
end
 
function isid1 (c)

# May the character code c continue an identifier?
# After the first character, digits are allowed as well.

implicit none

integer c
logical isid1

logical isalph
logical isdgt

isid1 = (c == ichar ('_')) || isalph (c) || isdgt (c)
end
 
#---------------------------------------------------------------------
 
function trimlf (str, n)

# "Trim left" leading spaces.

implicit none

character str(*)                # The string to "trim".
integer n                       # The length.
integer trimlf                  # The index of the first non-space
                                # character, or n + 1 if there is
                                # none.

logical isspc

integer j
logical done

# Default to n + 1, overwritten if a non-space is found.  The loop
# keeps the index test and the character test in separate
# expressions, because FORTRAN 77 .and. is not guaranteed to
# short-circuit.
trimlf = n + 1
j = 1
done = .false.
while (!done)
{
   if (n < j)
      done = .true.
   else if (!isspc (ichar (str(j))))
   {
      trimlf = j
      done = .true.
   }
   else
      j = j + 1
}
end
 
function trimrt (str, n)

# "Trim right" trailing spaces: compute the trimmed length.

implicit none

character str(*)               # The string to "trim".
integer n                      # The length including trailing spaces.
integer trimrt                 # The length without trailing spaces.

logical isspc

integer k
logical found

# Retreat k past trailing spaces.  The bound is tested before str(k)
# is examined, so str is never read out of range.
k = n
found = .false.
repeat
{
    if (k < 1)
        found = .true.
    else if (isspc (ichar (str(k))))
        k = k - 1
    else
        found = .true.
} until (found)
trimrt = k
end
 
#---------------------------------------------------------------------
 
subroutine toknam (tokno, str, i)

# Copy a token name to the character array str, starting at i.
# Exactly 16 characters (the name, blank-padded) are stored, so str
# must have at least 16 slots free from index i on.
# NOTE(review): the table order implies the TK* token-number macros
# run from 0 (Keyword_else) to 30 (End_of_input) -- confirm against
# their definitions.

implicit none

integer tokno                  # The token number (table index).
character str(*)               # Destination character array.
integer i                      # Starting index in str.
integer j

character*16 names(0:30)
character*16 nm

data names / "Keyword_else ", _
"Keyword_if ", _
"Keyword_print ", _
"Keyword_putc ", _
"Keyword_while ", _
"Op_multiply ", _
"Op_divide ", _
"Op_mod ", _
"Op_add ", _
"Op_subtract ", _
"Op_negate ", _
"Op_less ", _
"Op_lessequal ", _
"Op_greater ", _
"Op_greaterequal ", _
"Op_equal ", _
"Op_notequal ", _
"Op_not ", _
"Op_assign ", _
"Op_and ", _
"Op_or ", _
"LeftParen ", _
"RightParen ", _
"LeftBrace ", _
"RightBrace ", _
"Semicolon ", _
"Comma ", _
"Identifier ", _
"Integer ", _
"String ", _
"End_of_input " /

nm = names(tokno)
# Copy character by character: there is no whole-string assignment
# into a CHARACTER array in Fortran 77.
for (j = 0; j < 16; j = j + 1)
    str(i + j) = nm(1 + j : 1 + j)
end
 
subroutine intstr (str, i, n, x)

# Convert a positive integer to a substring.
# The value is written right-justified in str(i..i+n-1), padded on
# the left with spaces.
# NOTE(review): if x needs more than n digits, the excess high-order
# digits are silently dropped; x is assumed non-negative.

implicit none

character str(*)               # Destination string.
integer i, n                   # Index and length of the field.
integer x                      # The positive integer to represent.

integer j
integer y

if (x == 0)
{
    # Blank every position but the last ...
    for (j = 0; j < n - 1; j = j + 1)
        str(i + j) = ' '
    # ... then write the single '0'.  (After the loop j == n - 1.)
    str(i + j) = '0'
}
else
{
    y = x
    # Emit digits from the rightmost field position, low-order
    # digit first; blank-fill once the value is exhausted.
    for (j = n - 1; 0 <= j; j = j - 1)
    {
        if (y == 0)
            str(i + j) = ' '
        else
        {
            str(i + j) = char (mod (y, 10) + ichar ('0'))
            y = y / 10
        }
    }
}
end
 
subroutine prttok (strngs, tok)

# Print one token: line number, column number, token name, and (for
# identifiers, integers and strings) the token's text from the pool.

implicit none

character strngs(STRNSZ)       # String pool.
integer tok(TOKSZ)             # The token.

integer trimrt

character line(OUTLSZ)
character*20 fmt
integer i, n
integer tokno

# Build the record in a blank-filled buffer with fixed columns:
# 1-10 line no., 12-21 column no., 25.. token name, 45.. text.
for (i = 1; i <= OUTLSZ; i = i + 1)
    line(i) = ' '

call intstr (line, 1, 10, tok(ILINNO))
call intstr (line, 12, 10, tok(ICOLNO))

tokno = tok(ITOKNO)
call toknam (tokno, line, 25)
# Only these three token kinds carry text in the string pool.
if (tokno == TKID || tokno == TKINT || tokno == TKSTR)
{
    i = tok(IARGIX)
    n = tok(IARGLN)
    call cpystr (strngs, i, n, line, 45)
}

# Trim trailing blanks, then write via a run-time format "(nA)".
n = trimrt (line, OUTLSZ)
write (fmt, '(''('', I10, ''A)'')') n
write (*, fmt) (line(i), i = 1, n)
end
 
#---------------------------------------------------------------------
 
subroutine wrtpos (ln, cn)

# Write "At line L, column C" as the first record of a diagnostic.

implicit none

integer ln, cn                 # Line and column to report.

write (*, 1000) ln, cn
# A comma is required between the character constant and the I5
# edit descriptor; omitting it is a nonstandard extension that only
# lenient compilers (e.g. gfortran -std=legacy) accept.
1000 format ('At line ', I5, ', column ', I5)
end
 
#---------------------------------------------------------------------
 
subroutine utcmnt (ln, cn)

# Report an unterminated comment starting at (ln, cn) and halt.
# This routine does not return.

implicit none

integer ln, cn

call wrtpos (ln, cn)
write (*, '(''Unterminated comment'')')
stop
end
 
subroutine skpcmt (line, linno, colno, pushbk, npshbk, ln, cn)

# Skip to the end of a comment.  The opening "/*" has already been
# consumed; (ln, cn) locate it for the diagnostic.

implicit none

character line(LINESZ)         # Input buffer.
integer linno, colno           # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ)  # Pushback buffer.
integer npshbk                 # Number of characters pushed back.
integer ln, cn                 # Line and column of start of comment.

integer chr(CHRSZ)
logical done

done = .false.
while (!done)
{
    call getchr (line, linno, colno, pushbk, npshbk, chr)
    if (chr(ICHRCD) == ichar ('*'))
    {
        call getchr (line, linno, colno, pushbk, npshbk, chr)
        if (chr(ICHRCD) == ichar ('/'))
            done = .true.
        else if (chr(ICHRCD) == EOF)
            call utcmnt (ln, cn)
        else
        {
            # Push the character back so that a '*' here gets
            # reconsidered as a possible start of "*/".  Without
            # this, a comment ending in an even-length run of
            # stars (e.g. "/* x **/") would never terminate.
            call pshchr (pushbk, npshbk, chr)
        }
    }
    else if (chr(ICHRCD) == EOF)
        call utcmnt (ln, cn)
}
end
 
subroutine skpspc (line, linno, colno, pushbk, npshbk)

# Skip spaces and comments.
# On return, the next character to be read is the first one that is
# neither whitespace nor part of a /* ... */ comment (possibly EOF).

implicit none

character line(LINESZ)         # Input buffer.
integer linno, colno           # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ)  # Pushback buffer.
integer npshbk                 # Number of characters pushed back.

logical isspc

integer chr(CHRSZ)
integer chr1(CHRSZ)
integer ln, cn
logical done

done = .false.
while (!done)
{
    call getchr (line, linno, colno, pushbk, npshbk, chr)
    if (!isspc (chr(ICHRCD)))
    {
        if (chr(ICHRCD) != ichar ('/'))
        {
            # Not a space and cannot start a comment: restore it
            # for the caller and stop skipping.
            call pshchr (pushbk, npshbk, chr)
            done = .true.
        }
        else
        {
            # '/' may start a comment; peek at the next character.
            call getchr (line, linno, colno, pushbk, npshbk, chr1)
            if (chr1(ICHRCD) != ichar ('*'))
            {
                # Just a division operator.  Push back in LIFO
                # order so chr is re-read before chr1.
                call pshchr (pushbk, npshbk, chr1)
                call pshchr (pushbk, npshbk, chr)
                done = .true.
            }
            else
            {
                # Record where the comment began, for diagnostics.
                ln = chr(ILINNO)
                cn = chr(ICOLNO)
                call skpcmt (line, linno, colno, pushbk, npshbk, _
                    ln, cn)
            }
        }
    }
}
end
 
#---------------------------------------------------------------------
 
subroutine rwdlkp (strngs, istrng, src, i0, n0, ln, cn, tok)

# Reserved word lookup.
# Decide whether src(i0..i0+n0-1) is one of the five keywords or an
# ordinary identifier, and fill in tok accordingly.  Candidates are
# screened by length first, then compared character by character
# (there is no string-compare for CHARACTER arrays here).

implicit none

character strngs(STRNSZ)       # String pool.
integer istrng                 # String pool's next slot.
character src(*)               # The source string.
integer i0, n0                 # Index and length of the substring.
integer ln, cn                 # Line and column number
                               # to associate with the token.
integer tok(TOKSZ)             # The output token.

integer tokno
integer i, n

tokno = TKID

if (n0 == 2)
{
    if (src(i0) == 'i' && src(i0 + 1) == 'f')
        tokno = TKIF
}
else if (n0 == 4)
{
    if (src(i0) == 'e' && src(i0 + 1) == 'l' _
        && src(i0 + 2) == 's' && src(i0 + 3) == 'e')
        tokno = TKELSE
    else if (src(i0) == 'p' && src(i0 + 1) == 'u' _
        && src(i0 + 2) == 't' && src(i0 + 3) == 'c')
        tokno = TKPUTC
}
else if (n0 == 5)
{
    if (src(i0) == 'p' && src(i0 + 1) == 'r' _
        && src(i0 + 2) == 'i' && src(i0 + 3) == 'n' _
        && src(i0 + 4) == 't')
        tokno = TKPRNT
    else if (src(i0) == 'w' && src(i0 + 1) == 'h' _
        && src(i0 + 2) == 'i' && src(i0 + 3) == 'l' _
        && src(i0 + 4) == 'e')
        tokno = TKWHIL
}

# Only true identifiers carry text in the string pool; keywords are
# fully described by their token number.
i = 0
n = 0
if (tokno == TKID)
    call addstr (strngs, istrng, src, i0, n0, i, n)

tok(ITOKNO) = tokno
tok(IARGIX) = i
tok(IARGLN) = n
tok(ILINNO) = ln
tok(ICOLNO) = cn
end
 
subroutine scnwrd (line, linno, colno, pushbk, npshbk, ln, cn, buf, n)

# Scan characters that may represent an identifier, reserved word,
# or integer literal.
# Collects the longest run of letters, digits and underscores into
# buf, records the position of its first character in (ln, cn), and
# pushes back the first character that does not belong to the run.
# NOTE(review): buf is not bounds-checked; this assumes a word can
# never exceed LINESZ characters, i.e. never spans an input line --
# confirm against getchr's end-of-line handling.

implicit none

character line(LINESZ)         # Input buffer.
integer linno, colno           # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ)  # Pushback buffer.
integer npshbk                 # Number of characters pushed back.
integer ln, cn                 # Line and column number of the start.
character buf(LINESZ)          # The output buffer.
integer n                      # The length of the string collected.

logical isid1

integer chr(CHRSZ)

n = 0
call getchr (line, linno, colno, pushbk, npshbk, chr)
# The reported position is that of the first character read, even
# if it turns out not to start a word (then n stays 0).
ln = chr(ILINNO)
cn = chr(ICOLNO)
while (isid1 (chr(ICHRCD)))
{
    n = n + 1
    buf(n) = char (chr(ICHRCD))
    call getchr (line, linno, colno, pushbk, npshbk, chr)
}
call pshchr (pushbk, npshbk, chr)
end
 
subroutine scnidr (strngs, istrng, _
    line, linno, colno, pushbk, npshbk, _
    tok)

# Scan an identifier or reserved word.
# scnwrd collects the word; rwdlkp decides whether it is a keyword
# or a plain identifier and fills in tok.

implicit none

character strngs(STRNSZ)       # String pool.
integer istrng                 # String pool's next slot.
character line(LINESZ)         # Input buffer.
integer linno, colno           # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ)  # Pushback buffer.
integer npshbk                 # Number of characters pushed back.
integer ln, cn                 # Line and column number of the start.
                               # (Local, set by scnwrd.)
integer tok(TOKSZ)             # The output token.

character buf(LINESZ)
integer n
call scnwrd (line, linno, colno, pushbk, npshbk, ln, cn, buf, n)
call rwdlkp (strngs, istrng, buf, 1, n, ln, cn, tok)
end
 
subroutine scnint (strngs, istrng, _
    line, linno, colno, pushbk, npshbk, _
    tok)

# Scan a positive integer literal.
# scnwrd collects a whole identifier-like word; if any character of
# it is not a digit (e.g. "123abc"), that is a lexical error.

implicit none

character strngs(STRNSZ)       # String pool.
integer istrng                 # String pool's next slot.
character line(LINESZ)         # Input buffer.
integer linno, colno           # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ)  # Pushback buffer.
integer npshbk                 # Number of characters pushed back.
integer ln, cn                 # Line and column number of the start.
                               # (Local, set by scnwrd.)
integer tok(TOKSZ)             # The output token.

logical isdgt

character buf(LINESZ)
integer n0, n
integer i, j, k
character*80 fmt
call scnwrd (line, linno, colno, pushbk, npshbk, ln, cn, buf, n0)
for (j = 1; j <= n0; j = j + 1)
    if (!isdgt (ichar (buf(j))))
    {
        call wrtpos (ln, cn)
        # Build a run-time format so that exactly n0 characters of
        # buf are printed between the quotation marks.
        write (fmt, 1000) n0
1000 format ('(''Not a legal word: "''', I10, 'A, ''"'')')
        write (*, fmt) (buf(k), k = 1, n0)
        stop
    }

call addstr (strngs, istrng, buf, 1, n0, i, n)

tok(ITOKNO) = TKINT
tok(IARGIX) = i
tok(IARGLN) = n
tok(ILINNO) = ln
tok(ICOLNO) = cn
end
 
subroutine utclit (ln, cn)

# Report an unterminated character literal starting at (ln, cn) and
# halt.  This routine does not return.

implicit none

integer ln, cn

call wrtpos (ln, cn)
write (*, '(''Unterminated character literal'')')
stop
end
 
subroutine scnch1 (strngs, istrng, _
    line, linno, colno, pushbk, npshbk, _
    tok)

# Scan a character literal, without yet checking that the literal
# ends correctly.
# A character literal is emitted as an Integer token whose text is
# the decimal character code.  The codes for the two supported
# escapes, '\n' ("10") and '\\' ("92"), sit at the fixed string-pool
# offsets LOC10 and LOC92, preloaded by the main program.

implicit none

character strngs(STRNSZ)       # String pool.
integer istrng                 # String pool's next slot.
character line(LINESZ)         # Input buffer.
integer linno, colno           # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ)  # Pushback buffer.
integer npshbk                 # Number of characters pushed back.
integer tok(TOKSZ)             # The output token.

integer trimlf

integer bufsz
parameter (bufsz = 40)

integer chr(CHRSZ)
integer chr1(CHRSZ)
integer chr2(CHRSZ)
integer ln, cn
character buf(bufsz)
integer i, j, n

# Refetch the opening quote.
call getchr (line, linno, colno, pushbk, npshbk, chr)
ln = chr(ILINNO)
cn = chr(ICOLNO)

tok(ITOKNO) = TKINT
tok(ILINNO) = ln
tok(ICOLNO) = cn

call getchr (line, linno, colno, pushbk, npshbk, chr1)
if (chr1(ICHRCD) == EOF)
    call utclit (ln, cn)
if (chr1(ICHRCD) == BACKSL)
{
    call getchr (line, linno, colno, pushbk, npshbk, chr2)
    if (chr2(ICHRCD) == EOF)
        call utclit (ln, cn)
    else if (chr2(ICHRCD) == ichar ('n'))
    {
        tok(IARGIX) = LOC10 # "10" = code for Unix newline
        tok(IARGLN) = 2
    }
    else if (chr2(ICHRCD) == BACKSL)
    {
        tok(IARGIX) = LOC92 # "92" = code for backslash
        tok(IARGLN) = 2
    }
    else
    {
        call wrtpos (ln, cn)
        write (*, '(''Unsupported escape: '', 1A)') _
            char (chr2(ICHRCD))
        stop
    }
}
else
{
    # Character codes are non-negative, so we can use intstr.
    call intstr (buf, 1, bufsz, chr1(ICHRCD))

    # intstr right-justifies, so skip the leading blanks before
    # adding the digits to the pool.
    j = trimlf (buf, bufsz)
    call addstr (strngs, istrng, buf, j, bufsz - (j - 1), i, n)
    tok(IARGIX) = i
    tok(IARGLN) = n
}
end
 
subroutine scnch (strngs, istrng, _
    line, linno, colno, pushbk, npshbk, _
    tok)

# Scan a character literal.
# Delegates the content to scnch1, then insists on an immediate
# closing quote; otherwise the input is scanned further only to
# decide which diagnostic to report.

implicit none

character strngs(STRNSZ)       # String pool.
integer istrng                 # String pool's next slot.
character line(LINESZ)         # Input buffer.
integer linno, colno           # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ)  # Pushback buffer.
integer npshbk                 # Number of characters pushed back.
integer tok(TOKSZ)             # The output token.

integer ln, cn
integer chr(CHRSZ)

# Remember the literal's starting position for the error messages.
call getpos (line, linno, colno, pushbk, npshbk, ln, cn)
call scnch1 (strngs, istrng, _
    line, linno, colno, pushbk, npshbk, _
    tok)
call getchr (line, linno, colno, pushbk, npshbk, chr)
if (chr(ICHRCD) != ichar (''''))
{
    while (.true.)
    {
        if (chr(ICHRCD) == EOF)
        {
            call utclit (ln, cn)
            # (utclit does not return; this stop is unreachable.)
            stop
        }
        else if (chr(ICHRCD) == ichar (''''))
        {
            call wrtpos (ln, cn)
            write (*, '(''Unsupported multicharacter literal'')')
            stop
        }
        call getchr (line, linno, colno, pushbk, npshbk, chr)
    }
}
end
 
subroutine scnstr (strngs, istrng, _
    line, linno, colno, pushbk, npshbk, _
    tok)

# Scan a string literal.
# The pooled token text keeps the surrounding double quotes, and the
# escapes \n and \\ are kept in their two-character source form.

implicit none

character strngs(STRNSZ)       # String pool.
integer istrng                 # String pool's next slot.
character line(LINESZ)         # Input buffer.
integer linno, colno           # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ)  # Pushback buffer.
integer npshbk                 # Number of characters pushed back.
integer tok(TOKSZ)             # The output token.

integer ln, cn
integer chr1(CHRSZ)
integer chr2(CHRSZ)
character buf(LINESZ + 10) # Enough space, with some room to spare.
integer n0
integer i, n

# Refetch the opening quote; it fixes the token's position.
call getchr (line, linno, colno, pushbk, npshbk, chr1)
ln = chr1(ILINNO)
cn = chr1(ICOLNO)

tok(ITOKNO) = TKSTR
tok(ILINNO) = ln
tok(ICOLNO) = cn

n0 = 1
buf(n0) = '"'

call getchr (line, linno, colno, pushbk, npshbk, chr1)
while (chr1(ICHRCD) != ichar ('"'))
{
    # Our input method always puts a NEWLIN before EOF, and so this
    # test is redundant, unless someone changes the input method.
    if (chr1(ICHRCD) == EOF || chr1(ICHRCD) == NEWLIN)
    {
        call wrtpos (ln, cn)
        write (*, '(''Unterminated string literal'')')
        stop
    }
    if (chr1(ICHRCD) == BACKSL)
    {
        call getchr (line, linno, colno, pushbk, npshbk, chr2)
        if (chr2(ICHRCD) == ichar ('n'))
        {
            # Keep "\n" as its two source characters.
            n0 = n0 + 1
            buf(n0) = char (BACKSL)
            n0 = n0 + 1
            buf(n0) = 'n'
        }
        else if (chr2(ICHRCD) == BACKSL)
        {
            # Keep "\\" as its two source characters.
            n0 = n0 + 1
            buf(n0) = char (BACKSL)
            n0 = n0 + 1
            buf(n0) = char (BACKSL)
        }
        else
        {
            call wrtpos (chr1(ILINNO), chr1(ICOLNO))
            write (*, '(''Unsupported escape sequence'')')
            stop
        }
    }
    else
    {
        n0 = n0 + 1
        buf(n0) = char (chr1(ICHRCD))
    }
    call getchr (line, linno, colno, pushbk, npshbk, chr1)
}
n0 = n0 + 1
buf(n0) = '"'

call addstr (strngs, istrng, buf, 1, n0, i, n)
tok(IARGIX) = i
tok(IARGLN) = n
end
 
subroutine unxchr (chr)

# Report an unexpected character (with its position) and halt.
# This routine does not return.

implicit none

integer chr(CHRSZ)

call wrtpos (chr(ILINNO), chr(ICOLNO))
write (*, 1000) char (chr(ICHRCD))
1000 format ('Unexpected character ''', A1, '''')
stop
end
 
subroutine scntok (strngs, istrng, _
    line, linno, colno, pushbk, npshbk, _
    tok)

# Scan a token.
# Dispatches on the first character.  Two-character operators are
# recognized by peeking one character ahead and pushing it back if
# it does not complete the operator.  Literals, numbers and words
# are handed to the specialized scanners with the first character
# pushed back.

implicit none

character strngs(STRNSZ)       # String pool.
integer istrng                 # String pool's next slot.
character line(LINESZ)         # Input buffer.
integer linno, colno           # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ)  # Pushback buffer.
integer npshbk                 # Number of characters pushed back.
integer tok(TOKSZ)             # The output token.

logical isdgt
logical isid0

integer chr(CHRSZ)
integer chr1(CHRSZ)
integer ln, cn

call getchr (line, linno, colno, pushbk, npshbk, chr)
ln = chr(ILINNO)
cn = chr(ICOLNO)
tok(ILINNO) = ln
tok(ICOLNO) = cn
tok(IARGIX) = 0
tok(IARGLN) = 0
if (chr(ICHRCD) == ichar (','))
    tok(ITOKNO) = TKCMMA
else if (chr(ICHRCD) == ichar (';'))
    tok(ITOKNO) = TKSEMI
else if (chr(ICHRCD) == ichar ('('))
    tok(ITOKNO) = TKLPAR
else if (chr(ICHRCD) == ichar (')'))
    tok(ITOKNO) = TKRPAR
else if (chr(ICHRCD) == ichar ('{'))
    tok(ITOKNO) = TKLBRC
else if (chr(ICHRCD) == ichar ('}'))
    tok(ITOKNO) = TKRBRC
else if (chr(ICHRCD) == ichar ('*'))
    tok(ITOKNO) = TKMUL
else if (chr(ICHRCD) == ichar ('/'))
    tok(ITOKNO) = TKDIV # Comments were removed earlier by skpspc.
else if (chr(ICHRCD) == ichar ('%'))
    tok(ITOKNO) = TKMOD
else if (chr(ICHRCD) == ichar ('+'))
    tok(ITOKNO) = TKADD
else if (chr(ICHRCD) == ichar ('-'))
    tok(ITOKNO) = TKSUB # '-' always lexes as Op_subtract here.
else if (chr(ICHRCD) == ichar ('<'))
{
    call getchr (line, linno, colno, pushbk, npshbk, chr1)
    if (chr1(ICHRCD) == ichar ('='))
        tok(ITOKNO) = TKLE
    else
    {
        call pshchr (pushbk, npshbk, chr1)
        tok(ITOKNO) = TKLT
    }
}
else if (chr(ICHRCD) == ichar ('>'))
{
    call getchr (line, linno, colno, pushbk, npshbk, chr1)
    if (chr1(ICHRCD) == ichar ('='))
        tok(ITOKNO) = TKGE
    else
    {
        call pshchr (pushbk, npshbk, chr1)
        tok(ITOKNO) = TKGT
    }
}
else if (chr(ICHRCD) == ichar ('='))
{
    call getchr (line, linno, colno, pushbk, npshbk, chr1)
    if (chr1(ICHRCD) == ichar ('='))
        tok(ITOKNO) = TKEQ
    else
    {
        call pshchr (pushbk, npshbk, chr1)
        tok(ITOKNO) = TKASGN
    }
}
else if (chr(ICHRCD) == ichar ('!'))
{
    call getchr (line, linno, colno, pushbk, npshbk, chr1)
    if (chr1(ICHRCD) == ichar ('='))
        tok(ITOKNO) = TKNE
    else
    {
        call pshchr (pushbk, npshbk, chr1)
        tok(ITOKNO) = TKNOT
    }
}
else if (chr(ICHRCD) == ichar ('&'))
{
    # A lone '&' is not a token.  chr1 is not pushed back on the
    # error path, but unxchr stops the program, so nothing is lost.
    call getchr (line, linno, colno, pushbk, npshbk, chr1)
    if (chr1(ICHRCD) == ichar ('&'))
        tok(ITOKNO) = TKAND
    else
        call unxchr (chr)
}
else if (chr(ICHRCD) == ichar ('|'))
{
    # Likewise, a lone '|' is not a token.
    call getchr (line, linno, colno, pushbk, npshbk, chr1)
    if (chr1(ICHRCD) == ichar ('|'))
        tok(ITOKNO) = TKOR
    else
        call unxchr (chr)
}
else if (chr(ICHRCD) == ichar ('"'))
{
    call pshchr (pushbk, npshbk, chr)
    call scnstr (strngs, istrng, _
        line, linno, colno, pushbk, npshbk, _
        tok)
}
else if (chr(ICHRCD) == ichar (''''))
{
    call pshchr (pushbk, npshbk, chr)
    call scnch (strngs, istrng, _
        line, linno, colno, pushbk, npshbk, _
        tok)
}
else if (isdgt (chr(ICHRCD)))
{
    call pshchr (pushbk, npshbk, chr)
    call scnint (strngs, istrng, _
        line, linno, colno, pushbk, npshbk, _
        tok)
}
else if (isid0 (chr(ICHRCD)))
{
    call pshchr (pushbk, npshbk, chr)
    call scnidr (strngs, istrng, _
        line, linno, colno, pushbk, npshbk, _
        tok)
}
else
    call unxchr (chr)
end
 
subroutine scntxt (strngs, istrng, _
    line, linno, colno, pushbk, npshbk)

# Scan the text and print the token stream.
# Loops until EOF, then emits a final End_of_input token carrying
# the EOF's position.

implicit none

character strngs(STRNSZ)       # String pool.
integer istrng                 # String pool's next slot.
character line(LINESZ)         # Input buffer.
integer linno, colno           # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ)  # Pushback buffer.
integer npshbk                 # Number of characters pushed back.

integer chr(CHRSZ)
integer tok(TOKSZ)

# Prime the loop with an arbitrary non-EOF code.
chr(ICHRCD) = ichar ('x')
while (chr(ICHRCD) != EOF)
{
    call skpspc (line, linno, colno, pushbk, npshbk)
    call getchr (line, linno, colno, pushbk, npshbk, chr)
    if (chr(ICHRCD) != EOF)
    {
        # Not at end yet: restore the probe character and scan one
        # complete token.
        call pshchr (pushbk, npshbk, chr)
        call scntok (strngs, istrng, _
            line, linno, colno, pushbk, npshbk, _
            tok)
        call prttok (strngs, tok)
    }
}
tok(ITOKNO) = TKEOI
tok(ILINNO) = chr(ILINNO)
tok(ICOLNO) = chr(ICOLNO)
tok(IARGIX) = 0
tok(IARGLN) = 0
call prttok (strngs, tok)
end
 
#---------------------------------------------------------------------
 
program lex

# Main program: preload the string pool with the texts "10" and "92"
# (character codes for newline and backslash, at the fixed offsets
# LOC10 and LOC92 that scnch1 relies on), initialize the input
# state, then scan standard input and print the token stream.

implicit none

character strngs(STRNSZ)       # String pool.
integer istrng                 # String pool's next slot.
character line(LINESZ)         # Input buffer.
integer linno, colno           # Current line and column numbers.
integer pushbk(CHRSZ, PSHBSZ)  # Pushback buffer.
integer npshbk                 # Number of characters pushed back.

integer i, n

istrng = 1

# Locate "10" (newline) at 1 in the string pool.
line(1) = '1'
line(2) = '0'
call addstr (strngs, istrng, line, 1, 2, i, n)
# Either a wrong index or a wrong length is an error, so the test
# must use OR (the original "&&" only caught both failing at once).
if (i != 1 || n != 2)
{
    write (*, '(''internal error'')')
    stop
}

# Locate "92" (backslash) at 3 in the string pool.
line(1) = '9'
line(2) = '2'
call addstr (strngs, istrng, line, 1, 2, i, n)
if (i != 3 || n != 2)
{
    write (*, '(''internal error'')')
    stop
}

linno = 0
colno = LINESZ + 1 # This will trigger a READ.
npshbk = 0

call scntxt (strngs, istrng, line, linno, colno, pushbk, npshbk)
end
 
######################################################################</syntaxhighlight>
 
 
{{out}}
<pre>$ ratfor77 lex-in-ratfor.r > lex-in-ratfor.f && gfortran -O2 -std=legacy -fcheck=all lex-in-ratfor.f && ./a.out < compiler-tests/primes.t
4 1 Identifier count
4 7 Op_assign
4 9 Integer 1
4 10 Semicolon
5 1 Identifier n
5 3 Op_assign
5 5 Integer 1
5 6 Semicolon
6 1 Identifier limit
6 7 Op_assign
6 9 Integer 100
6 12 Semicolon
7 1 Keyword_while
7 7 LeftParen
7 8 Identifier n
7 10 Op_less
7 12 Identifier limit
7 17 RightParen
7 19 LeftBrace
8 5 Identifier k
8 6 Op_assign
8 7 Integer 3
8 8 Semicolon
9 5 Identifier p
9 6 Op_assign
9 7 Integer 1
9 8 Semicolon
10 5 Identifier n
10 6 Op_assign
10 7 Identifier n
10 8 Op_add
10 9 Integer 2
10 10 Semicolon
11 5 Keyword_while
11 11 LeftParen
11 12 LeftParen
11 13 Identifier k
11 14 Op_multiply
11 15 Identifier k
11 16 Op_lessequal
11 18 Identifier n
11 19 RightParen
11 21 Op_and
11 24 LeftParen
11 25 Identifier p
11 26 RightParen
11 27 RightParen
11 29 LeftBrace
12 9 Identifier p
12 10 Op_assign
12 11 Identifier n
12 12 Op_divide
12 13 Identifier k
12 14 Op_multiply
12 15 Identifier k
12 16 Op_notequal
12 18 Identifier n
12 19 Semicolon
13 9 Identifier k
13 10 Op_assign
13 11 Identifier k
13 12 Op_add
13 13 Integer 2
13 14 Semicolon
14 5 RightBrace
15 5 Keyword_if
15 8 LeftParen
15 9 Identifier p
15 10 RightParen
15 12 LeftBrace
16 9 Keyword_print
16 14 LeftParen
16 15 Identifier n
16 16 Comma
16 18 String " is prime\n"
16 31 RightParen
16 32 Semicolon
17 9 Identifier count
17 15 Op_assign
17 17 Identifier count
17 23 Op_add
17 25 Integer 1
17 26 Semicolon
18 5 RightBrace
19 1 RightBrace
20 1 Keyword_print
20 6 LeftParen
20 7 String "Total primes found: "
20 29 Comma
20 31 Identifier count
20 36 Comma
20 38 String "\n"
20 42 RightParen
20 43 Semicolon
21 1 End_of_input</pre>
 
=={{header|Scala}}==
Line 15,115 ⟶ 17,336:
The following code implements a configurable (from a symbol map and keyword map provided as parameters) lexical analyzer.
 
<syntaxhighlight lang="scala">
package xyz.hyperreal.rosettacodeCompiler
 
Line 15,376 ⟶ 17,597:
 
}
</syntaxhighlight>
 
=={{header|Scheme}}==
 
<syntaxhighlight lang="scheme">
(import (scheme base)
(scheme char)
Line 15,577 ⟶ 17,798:
(display-tokens (lexer (cadr (command-line))))
(display "Error: provide program filename\n"))
</syntaxhighlight>
 
{{out}}
Line 15,595 ⟶ 17,816:
 
 
<syntaxhighlight lang="sml">(*------------------------------------------------------------------*)
(* The Rosetta Code lexical analyzer, in Standard ML. Based on the ATS
and the OCaml. The intended compiler is Mlton or Poly/ML; there is
Line 16,401 ⟶ 18,622:
(* sml-indent-args: 2 *)
(* end: *)
(*------------------------------------------------------------------*)</syntaxhighlight>
 
 
Line 16,455 ⟶ 18,676:
{{libheader|Wren-fmt}}
{{libheader|Wren-ioutil}}
<syntaxhighlight lang="wren">import "./dynamic" for Enum, Struct, Tuple
import "./str" for Char
import "./fmt" for Fmt
import "./ioutil" for FileUtil
import "os" for Process
 
Line 16,804 ⟶ 19,025:
lineCount = lines.count
initLex.call()
process.call()</syntaxhighlight>
 
{{out}}
Line 16,846 ⟶ 19,067:
 
=={{header|Zig}}==
<syntaxhighlight lang="zig">
const std = @import("std");
 
Line 17,255 ⟶ 19,476:
return result.items;
}
</syntaxhighlight>
9,476

edits