Tokenize a string with escaping

(Print the output list in any format you like, as long as it is easy to see what the fields are.)
 
{{Template:Strings}}
 
;Related tasks:
*   [[Tokenize a string]]
*   [[Split a character string based on change of character|split a character string based on change of character]]
*   [[Brace expansion]]
<br><br>
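As an informal reference for the entries that follow (not part of the task statement), here is a minimal sketch of the two-state scan that most solutions below implement, written in Python with an illustrative function name; the separator is <code>|</code> and the escape is <code>^</code>, as in the test string:

<syntaxhighlight lang="python">def split_escaped(s, sep='|', esc='^'):
    # A minimal sketch of the task's semantics; the name is hypothetical.
    #
    # >>> split_escaped('one^|uno||three^^^^|four^^^|^cuatro|')
    # ['one|uno', '', 'three^^', 'four^|cuatro', '']
    tokens, token, escaped = [], '', False
    for c in s:
        if escaped:                  # previous character was the escape:
            token += c               # keep this one literally
            escaped = False
        elif c == esc:               # drop the escape itself, remember it
            escaped = True
        elif c == sep:               # an unescaped separator ends a token
            tokens.append(token)
            token = ''
        else:
            token += c
    tokens.append(token)             # final token (empty after a trailing separator)
    return tokens</syntaxhighlight>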
 
=={{header|11l}}==
{{trans|Python}}
 
<syntaxhighlight lang="11l">F token_with_escape(a, escape = ‘^’, separator = ‘|’)
[String] result
V token = ‘’
V state = 0
L(c) a
I state == 0
I c == escape
state = 1
E I c == separator
result.append(token)
token = ‘’
E
token ‘’= c
E I state == 1
token ‘’= c
state = 0
result.append(token)
R result
 
print(token_with_escape(‘one^|uno||three^^^^|four^^^|^cuatro|’).map(s -> ‘'’s‘'’).join(‘, ’))</syntaxhighlight>
 
{{out}}
<pre>
'one|uno', '', 'three^^', 'four^|cuatro', ''
</pre>
 
=={{header|8080 Assembly}}==
 
<syntaxhighlight lang="8080asm"> org 100h
jmp demo
;;; Routine to split a 0-terminated string
;;; Input: B=separator, C=escape, HL=string pointer.
;;; Output: DE=end of list of strings
;;; The split strings are stored in place.
split: mov d,h ; Set DE = output pointer
mov e,l
snext: mov a,m ; Get current input character
inx h ; Advance input pointer
stax d ; Write character at output pointer
ana a ; If zero, we are done
rz
cmp c ; Is it the escape character?
jz sesc
cmp b ; Is it the separator character?
jz ssep
inx d ; Otherwise, advance output pointer,
jmp snext ; and get the next character
sesc: mov a,m ; Store the escaped character without
inx h ; checking for anything except zero.
stax d
inx d
ana a ; Zero is still end of string
rz
jmp snext
ssep: xra a ; End of string, write zero terminator
stax d
inx d
jmp snext
;;; Use the routine to split the test-case string
demo: mvi b,'|' ; Separator character
mvi c,'^' ; Escape character
lxi h,test ; Pointer to test string
call split
;;; Print each string on its own line
lxi h,test
str: call puts ; Print string
call cmp16 ; Are we there yet?
jnc str ; If not, print the next string
ret
;;; 16-bit compare
cmp16: mov a,d
cmp h
rnz
mov a,e
cmp l
ret
;;; Print zero-terminated string with newline
puts: push d ; Keep DE registers
push h ; Keep pointer
lxi d,pfx ; Print prefix
mvi c,9
call 5
pop h ; Restore pointer
ploop: mov e,m ; Get current character
push h ; Keep pointer
mvi c,2 ; CP/M print character
call 5
pop h ; Restore pointer
mov a,m ; Is character zero?
ora a
inx h ; Increment pointer
jnz ploop ; If not, there are more characters
push h ; Keep pointer
lxi d,nl ; Write newline
mvi c,9 ; CP/M print string
call 5
pop h
pop d ; Restore DE registers
ret
pfx: db '> $' ; Prefix to make the output more obvious
nl: db 13,10,'$'
test: db 'one^|uno||three^^^^|four^^^|^cuatro|',0</syntaxhighlight>
 
{{out}}
 
<pre>> one|uno
>
> three^^
> four^|cuatro
>
</pre>
 
=={{header|Action!}}==
<syntaxhighlight lang="action!">DEFINE PTR="CARD"
 
TYPE Tokens=[
PTR buf ;BYTE ARRAY
PTR arr ;CARD ARRAY
PTR endPtr
BYTE count]
 
PROC Init(Tokens POINTER t BYTE ARRAY b PTR ARRAY a)
t.buf=b
t.arr=a
t.endPtr=b
t.count=0
RETURN
 
PROC AddToken(Tokens POINTER t CHAR ARRAY s)
PTR ARRAY a
CHAR ARRAY tmp
 
a=t.arr
tmp=t.endPtr
SCopy(tmp,s)
a(t.count)=tmp
t.count==+1
t.endPtr=t.endPtr+s(0)+1
RETURN
 
PROC PrintTokens(Tokens POINTER t)
BYTE i
PTR ARRAY a
a=t.arr
FOR i=0 TO t.count-1
DO
PrintF("""%S""%E",a(i))
OD
RETURN
 
PROC Append(CHAR ARRAY s CHAR c)
s(0)==+1
s(s(0))=c
RETURN
 
PROC Tokenize(CHAR ARRAY s CHAR sep,esc Tokens POINTER res)
BYTE ARRAY b(200)
PTR ARRAY a(20)
CHAR ARRAY tmp(255)
BYTE i,isEsc
CHAR c
 
Init(res,b,a)
isEsc=0
tmp(0)=0
FOR i=1 TO s(0)
DO
c=s(i)
IF isEsc THEN
isEsc=0
Append(tmp,c)
ELSE
IF c=esc THEN
isEsc=1
ELSEIF c=sep THEN
AddToken(res,tmp)
tmp(0)=0
ELSE
Append(tmp,c)
FI
FI
OD
AddToken(res,tmp)
RETURN
 
PROC Main()
Tokens t
 
Tokenize("one^|uno||three^^^^|four^^^|^cuatro|",'|,'^,t)
PrintTokens(t)
RETURN</syntaxhighlight>
{{out}}
[https://gitlab.com/amarok8bit/action-rosetta-code/-/raw/master/images/Tokenize_a_string_with_escaping.png Screenshot from Atari 8-bit computer]
<pre>
"one|uno"
""
"three^^"
"four^|cuatro"
""
</pre>
 
=={{header|Ada}}==
<syntaxhighlight lang="ada">with Ada.Text_Io;
with Ada.Containers.Indefinite_Vectors;
with Ada.Strings.Unbounded;
 
procedure Tokenize is
 
package String_Vectors is
new Ada.Containers.Indefinite_Vectors (Positive, String);
use String_Vectors;
 
function Split (Text : String;
Separator : Character := '|';
Escape : Character := '^') return Vector
is
use Ada.Strings.Unbounded;
Result : Vector;
Escaped : Boolean := False;
Accu : Unbounded_String;
begin
 
for Char of Text loop
 
case Escaped is
 
when False =>
if Char = Escape then
Escaped := True;
elsif Char = Separator then
Append (Result, To_String (Accu));
Accu := Null_Unbounded_String;
else
Append (Accu, Char);
end if;
 
when True =>
Append (Accu, Char);
Escaped := False;
 
end case;
 
end loop;
Append (Result, To_String (Accu));
 
return Result;
end Split;
 
procedure Put_Vector (List : Vector) is
use Ada.Text_Io;
begin
for Element of List loop
Put ("'"); Put (Element); Put ("'"); New_Line;
end loop;
end Put_Vector;
 
begin
Put_Vector (Split ("one^|uno||three^^^^|four^^^|^cuatro|"));
end Tokenize;</syntaxhighlight>
{{out}}
<pre>'one|uno'
''
'three^^'
'four^|cuatro'
''</pre>
 
=={{header|ALGOL 68}}==
<syntaxhighlight lang="algol68">BEGIN
# returns s parsed according to delimiter and escape #
PROC parse with escapes = ( STRING s, CHAR delimiter, escape )[]STRING:
IF ( UPB s - LWB s ) + 1 < 1 THEN
# empty string #
[ 1 : 0 ]STRING empty array;
empty array
ELSE
# at least one character #
# allow for a string composed entirely of delimiter characters #
[ 1 : ( UPB s - LWB s ) + 3 ]STRING result;
INT r pos := 1;
INT s pos := LWB s;
result[ r pos ] := "";
WHILE s pos <= UPB s DO
CHAR c = s[ s pos ];
IF c = delimiter THEN
# start a new element #
result[ r pos +:= 1 ] := ""
ELIF c = escape THEN
# use the next character even if it is an escape #
s pos +:= 1;
IF s pos < UPB s THEN
# the escape is not the last character #
result[ r pos ] +:= s[ s pos ]
FI
ELSE
# normal character #
result[ r pos ] +:= c
FI;
s pos +:= 1
OD;
result[ 1 : r pos ]
FI; # parse with escapes #
# task test case #
[]STRING tokens = parse with escapes( "one^|uno||three^^^^|four^^^|^cuatro|", "|", "^" );
FOR t pos FROM LWB tokens TO UPB tokens DO print( ( "[", tokens[ t pos ], "]", newline ) ) OD
END</syntaxhighlight>
{{out}}
<pre>
[one|uno]
[]
[three^^]
[four^|cuatro]
[]
</pre>
 
=={{header|AppleScript}}==
 
{{Trans|JavaScript}}
<syntaxhighlight lang="applescript">-------------------- TOKENIZE WITH ESCAPING ----------------
 
-- tokenize :: String -> Character -> Character -> [String]
on tokenize(str, delimChar, chrEsc)
script charParse
set blnEscChar to ((not blnEsc) and (x = chrEsc))
if ((not blnEsc) and (x = delimChar)) then
set k to ""
set ks to (tokens of a) & token of a
else
set k to (token of a) & cond(blnEscChar, "", x)
set ks to tokens of a
end if
{esc:blnEscChar, token:k, tokens:ks}
end |λ|
end script
 
 
--------------------------- TEST -------------------------
on run
script numberedLine
 
 
------------------- GENERIC FUNCTIONS --------------------
 
-- foldl :: (a -> b -> a) -> a -> [b] -> a
end tell
end foldl
 
 
-- Lift 2nd class handler function into 1st class script wrapper
end mReturn
 
 
-- splitOn :: String -> String -> [String]
on splitOn(pat, src)
    set {dlm, my text item delimiters} to ¬
        {my text item delimiters, pat}
    set xs to text items of src
    set my text item delimiters to dlm
    return xs
end splitOn
 
 
-- cond :: Bool -> a -> a -> a
g
end if
end cond</syntaxhighlight>
{{Out}}
<pre>1: one|uno
2: 
3: three^^
4: four^|cuatro
5: </pre>
 
=={{header|Arturo}}==
<syntaxhighlight lang="rebol">tokenize: function [s sep esc][
escaping: 0
 
loop 0..(size s)-1 [i][
{{trans|AWK}}
chr: get split s i
 
if? escaping=1 [
<lang arturo>
tokenize: @(s sep esc){
escaping: 0
loop [range 0 [size s]-1] @(i){
chr: [chars s].[i]
 
if escaping=1 {
prints chr
escaping: 0
} {]
else [
if chr=sep {
printcase ""[chr]
when? [=sep] [print ""]
 
when? [=esc] [escaping: 1]
} {
ifelse chr=esc[prints {chr]
]
escaping: 1
]
} {
prints chr
}
}
}
}
print ""
]
}</lang>
 
str: "one^|uno||three^^^^|four^^^|^cuatro|"
tokenize str "|" "^"</syntaxhighlight>
 
{{out}}
<pre>
one|uno

three^^
four^|cuatro</pre>
 
=={{header|AutoHotkey}}==
<syntaxhighlight lang="autohotkey">Tokenize(s,d,e){
	for i,v in x:=StrSplit(StrReplace(StrReplace(StrReplace(s,e e,Chr(0xFFFE)),e d,Chr(0xFFFF)),e),d)
		x[i]:=StrReplace(StrReplace(v,Chr(0xFFFE),e),Chr(0xFFFF),d)
	return x
}</syntaxhighlight>
Examples:<syntaxhighlight lang="autohotkey">str := "one^|uno||three^^^^|four^^^|^cuatro|"
for i, v in Tokenize(str, "|", "^")
	output .= i " : " v "`n"
MsgBox % output</syntaxhighlight>
{{out}}
<pre>1 : one|uno
2 : 
3 : three^^
4 : four^|cuatro
5 : </pre>

=={{header|AWK}}==
<syntaxhighlight lang="awk">
# syntax: GAWK -f TOKENIZE_A_STRING_WITH_ESCAPING.AWK
BEGIN {
    tokenize("one^|uno||three^^^^|four^^^|^cuatro|","|","^")
    exit(0)
}
function tokenize(str,sep,esc, chr,escaping,field,i) {
    printf(">%s<\n",str)
    printf("%02d: >",++field)
    for (i=1; i<=length(str); i++) {
      chr = substr(str,i,1)
      if (escaping == 1) {
        printf("%s",chr)
        escaping = 0
      }
      else if (chr == sep) {
        printf("<\n%02d: >",++field)
      }
      else if (chr == esc) {
        escaping = 1
      }
      else {
        printf("%s",chr)
      }
    }
    printf("<\n")
}
</syntaxhighlight>
{{out}}
<pre>
>one^|uno||three^^^^|four^^^|^cuatro|<
01: >one|uno<
02: ><
03: >three^^<
04: >four^|cuatro<
05: ><
</pre>
 
=={{header|BBC BASIC}}==
<langsyntaxhighlight lang="bbcbasic">REM >tokenizer
PROC_tokenize("one^|uno||three^^^^|four^^^|^cuatro|", "|", "^")
END
NEXT
PRINT
ENDPROC</syntaxhighlight>
{{out}}
<pre> 1 one|uno
 2 
 3 three^^
4 four^|cuatro
5 </pre>
 
=={{header|BQN}}==
 
<syntaxhighlight lang="bqn">str ← "one^|uno||three^^^^|four^^^|^cuatro|"
Split ← ((⊢-˜+`׬)∘=⊔⊢)
SplitE ← {
esc ← <`'^'=𝕩
rem ← »esc
spl ← (¬rem)∧'|'=𝕩
𝕩⊔˜(⊢-(esc∨spl)×1⊸+)+`spl
}
 
•Show SplitE str</syntaxhighlight>
<syntaxhighlight lang="text">⟨ "one|uno" ⟨⟩ "three^^" "four^|cuatro" ⟩</syntaxhighlight>
 
=={{header|C}}==
{{works with|C}}
 
<langsyntaxhighlight lang="c">#include <stdlib.h>
#include <stdio.h>
 
 
return list;
}</syntaxhighlight>
 
{{Out}}
<pre>
1. one|uno
2.
3. three^^
4. four^|cuatro
5.
</pre>
 
=={{header|C sharp}}==
<langsyntaxhighlight lang="csharp">using System;
using System.Text;
using System.Collections.Generic;
return result;
}
}</syntaxhighlight>
{{out}}
<pre>
: one|uno
: 
: three^^
: four^|cuatro
: </pre>
 
=={{header|C++}}==
<syntaxhighlight lang="cpp">#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>
 
using namespace std;
 
vector<string> tokenize(const string& input, char seperator, char escape) {
vector<string> output;
string token;
 
bool inEsc = false;
for (char ch : input) {
if (inEsc) {
inEsc = false;
} else if (ch == escape) {
inEsc = true;
continue;
} else if (ch == seperator) {
output.push_back(token);
token = "";
continue;
}
token += ch;
}
if (inEsc)
throw invalid_argument("Invalid terminal escape");
 
output.push_back(token);
return output;
}
 
int main() {
string sample = "one^|uno||three^^^^|four^^^|^cuatro|";
 
cout << sample << endl;
cout << '[';
for (auto t : tokenize(sample, '|', '^')) {
cout << '"' << t << "\", ";
}
cout << ']' << endl;
 
return 0;
}</syntaxhighlight>
{{out}}
<pre>one^|uno||three^^^^|four^^^|^cuatro|
["one|uno", "", "three^^", "four^|cuatro", "", ]</pre>
 
=={{header|CLU}}==
<syntaxhighlight lang="clu">tokenize = iter (sep, esc: char, s: string) yields (string)
escape: bool := false
part: array[char] := array[char]$[]
for c: char in string$chars(s) do
if escape then
escape := false
array[char]$addh(part,c)
elseif c=esc then
escape := true
elseif c=sep then
yield(string$ac2s(part))
part := array[char]$[]
else
array[char]$addh(part,c)
end
end
yield(string$ac2s(part))
end tokenize
 
start_up = proc ()
po: stream := stream$primary_output()
testcase: string := "one^|uno||three^^^^|four^^^|^quatro|"
for part: string in tokenize('|', '^', testcase) do
stream$putl(po, "\"" || part || "\"")
end
end start_up</syntaxhighlight>
{{out}}
<pre>"one|uno"
""
"three^^"
"four^|quatro"
""</pre>
 
=={{header|COBOL}}==
<langsyntaxhighlight lang="cobol"> >>SOURCE FORMAT FREE
identification division.
program-id. 'tokenizewithescaping'.
.
end program 'tokenizewithescaping'.
</syntaxhighlight>
 
{{out}}
 
=={{header|Common Lisp}}==
<langsyntaxhighlight lang="lisp">(defun split (input separator escape)
(flet ((make-string-buffer ()
(make-array 0 :element-type 'character :adjustable t :fill-pointer t)))
(defun main ()
(dolist (token (split "one^|uno||three^^^^|four^^^|^cuatro|" #\| #\^))
(format t "'~A'~%" token)))</langsyntaxhighlight>
{{out}}
<pre>'one|uno'
''
'three^^'
'four^|cuatro'
''</pre>

=={{header|D}}==
{{trans|Java}}
<langsyntaxhighlight Dlang="d">import std.stdio;
 
void main() {
output.put(token.data.idup);
return output.data;
}</syntaxhighlight>
 
{{out}}
<pre>["one|uno", "", "three^^", "four^|cuatro", ""]</pre>

=={{header|Dyalect}}==
{{trans|C#}}
 
<langsyntaxhighlight lang="dyalect">func String.tokenizeTokenize(separator, escape) {
var buffer = []
var escaping = false
for c in this {
if escaping {
buffer.addAdd(c)
escaping = false
} else if c == escape {
escaping = true
} else if c == separator {
yield buffer.flushFlush();
} else {
buffer.addAdd(c);
}
}
 
if buffer.lenLength() > 0 || this[this.lenLength() - 1] == separator {
yield buffer.flushFlush()
}
}
func Array.flushFlush() {
var str = String.concatConcat(values: this)
this.clearClear()
str
}
 
constlet testcase = "one^|uno||three^^^^|four^^^|^cuatro|";
for token in testcase.tokenizeTokenize(separator: '|', escape: '^') {
print(": \(token)")
}</langsyntaxhighlight>
 
{{out}}
<pre>
: one|uno
: 
: three^^
: four^|cuatro
: 
</pre>

=={{header|Elena}}==
{{trans|C#}}
ELENA 6.x :
<langsyntaxhighlight lang="elena">import extensions;
import extensions'routines;
import system'collections;
bool escaping := false;
self.forEach::(ch)
{
if (escaping)
{
buffer.write(ch);
escaping := false
}
else
{
buffer.write(ch)
}
};
public program()
{
testcase.tokenize("|", "^").forEach:(printingLn)
}</syntaxhighlight>
{{out}}
<pre>
one|uno

three^^
four^|cuatro

</pre>
 
=={{header|F_Sharp|F#}}==
<langsyntaxhighlight lang="fsharp">open System
open System.Text.RegularExpressions
 
|> Seq.map (unescape esc)
|> Seq.iter (fun s -> printfn "'%s'" s)
0</syntaxhighlight>
{{out}}
<pre>'one|uno'
''
'three^^'
'four^|cuatro'
''</pre>

=={{header|Factor}}==
This example uses Factor's <code>parser-combinators</code> vocabulary, which is modeled after Haskell's parser combinators. Page <tt>51</tt> of [https://bluishcoder.co.nz/factor-articles.pdf this pdf] contains a useful introduction to this vocabulary.
{{works with|Factor|0.99 2019-10-06}}
<langsyntaxhighlight lang="factor">USING: accessors kernel lists literals namespaces
parser-combinators prettyprint sequences strings ;
 
 
"one^|uno||three^^^^|four^^^|^cuatro|"
CHAR: | CHAR: ^ tokenize .</syntaxhighlight>
{{out}}
<pre>
{ "one|uno" "" "three^^" "four^|cuatro" "" }
</pre>
 
=={{header|Forth}}==
<syntaxhighlight lang="forth">variable 'src
variable #src
variable offset
 
: advance 1 offset +! ;
: chr@ offset @ 'src @ + c@ ;
: nextchr advance chr@ ;
: bound offset @ #src @ u< ;
: separator? dup [char] | = if drop cr else emit then ;
: escape? dup [char] ^ = if drop nextchr emit else separator? then ;
: tokenize 0 offset ! begin bound while nextchr escape? repeat ;
 
\ Test of function
Here 'src ! ," one^|uno||three^^^^|four^^^|^cuatro|" here 'src @ - #src !
page
cr ." #### start ####" cr tokenize cr ." #### End ####" cr
</syntaxhighlight>
{{output}}
<pre>
#### start ####
one|uno
 
three^^
four^|cuatro
 
#### End ####
</pre>
 
 
=={{header|Fortran}}==
First Fortran (1958) offered no facilities for inspecting or manipulating text, until Fortran IV when the <code>A</code> format code was introduced whereby text could be read or written from numeric variables. The difficulties and incompatibilities between different computers were eased with F77 that offered CHARACTER*n variables, though they are not quite strings that have a varying length. F95 introduces the ability to define a compound entity such as a string and F2003 standardised a version of strings whereby with each assignment to such a variable, it would be re-allocated with the required amount of storage. Otherwise, one proceeds with CHARACTER variables and an associated variable containing its current length as with <code>TOKEN</code> and <code>L</code>. However, when passed to subroutines (or functions) as a parameter, a CHARACTER variable is supplied along with a secret additional parameter giving the size of the variable, and this is stringlike, so long as there is no need to change the length. Thus, the length of parameter TEXT to subroutine SPLIT can be found via LEN(TEXT).
 
The source style is F90 simply for the convenience of having subroutine SPLOT defined within subroutine SPLIT so as to gain access to certain variables. If separate subroutines were to be used, then there would have to be parameters or COMMON variables, or, one could just replicate the code within SPLIT. A further F90 feature involves declaring the size of internal variable <code>TOKEN</code> to be <code>LEN(TEXT)</code>, which is surely the largest it could be. Otherwise, one would have to select some "surely big enough" value.<syntaxhighlight lang="fortran"> SUBROUTINE SPLIT(TEXT,SEP,ESC) !Identifies and prints tokens from within a text.
CHARACTER*(*) TEXT !To be scanned.
CHARACTER*(1) SEP !The only separator for tokens.
CALL SPLIT("one^|uno||three^^^^|four^^^|^cuatro|","|","^")
 
END</syntaxhighlight>
 
The output has the text of the tokens marked >thus<
<pre>
>one|uno<
><
>three^^<
>four^|cuatro<
><
</pre>
 
In this example the DO-loop relentlessly steps through the text, and in general this would not be convenient. Normally, token identification proceeds within a much larger context where one would not discard the token immediately after it is isolated, and rather than copying the text hither and thither, one might prefer to identify it in-place, say with variables <code>L1</code> and <code>L2</code> identifying the start and end positions within the working area. In such a case there would no longer be a need for a variable <code>TOKEN</code> and the angst of deciding on a suitable maximum size. This would also make it easier in any error messages to show context and provenance. However, the bizarre miscegenation of "escape" sequences (especially confusing within text ''literals'') means that the source text does not necessarily constitute the text of the token.
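By way of illustration only, here is a sketch of that in-place, index-pair style (shown in Python for brevity rather than Fortran, with hypothetical names): the scanner reports each raw token as a pair of start and end positions into the original text, and only the later folding of escapes forces any copying:

<syntaxhighlight lang="python">def token_spans(text, sep='|', esc='^'):
    # Yield (l1, l2) index pairs bounding each raw token in text;
    # the slice text[l1:l2] still contains its escape characters.
    l1, i = 0, 0
    while i < len(text):
        if text[i] == esc:
            i += 2                    # skip the escape and the escaped character
        elif text[i] == sep:
            yield (l1, i)             # raw token is text[l1:i]
            l1 = i = i + 1
        else:
            i += 1
    yield (l1, len(text))             # trailing token

text = 'one^|uno||three^^^^|four^^^|^cuatro|'
for l1, l2 in token_spans(text):
    print(repr(text[l1:l2]))          # escapes still need processing before use</syntaxhighlight>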
 
 
=={{header|FreeBASIC}}==
{{trans|Ring}}
<syntaxhighlight lang="freebasic">Sub tokenize(cadena As String, separador As String, escape As String)
Dim As Integer campo = 1
Dim As Boolean escapando = false
Dim As String char
Print ""; campo; " ";
For i As Integer = 1 To Len(cadena)
char = Mid(cadena, i, 1)
If escapando Then
Print char;
escapando = false
Else
Select Case char
Case separador
Print
campo += 1
Print ""; campo; " ";
Case escape
escapando = true
Case Else
Print char;
End Select
End If
Next i
Print
End Sub
 
tokenize("one^|uno||three^^^^|four^^^|^cuatro|", "|", "^")
Sleep</syntaxhighlight>
{{out}}
<pre>
Same as the Ring entry.
</pre>
 
 
=={{header|Go}}==
<langsyntaxhighlight lang="go">package main
 
import (
fmt.Printf("Tokens: %q\n", tokens)
}
}</syntaxhighlight>
{{out}}
<pre>
Line 1,042 ⟶ 1,446:
=== Deterministic Finite Automaton ===
 
<langsyntaxhighlight lang="haskell">splitEsc :: (Foldable t1, Eq t) => t -> t -> t1 t -> [[t]]
splitEsc sep esc = reverse . map reverse . snd . foldl process (0, [[]])
where process (st, r:rs) ch
| st == 0 && ch == esc = (1, r:rs)
| st == 0 && ch == sep = (0, []:r:rs)
| st == 1 && sep == esc && ch /= sep = (0, [ch]:r:rs)
| otherwise = (0, (ch:r):rs)</syntaxhighlight>
 
{{out}}
<pre>
λ> splitEsc '|' '^' "one^|uno||three^^^^|four^^^|^cuatro|"
["one|uno","","three^^","four^|cuatro",""]
</pre>

Constant in space (~ O(k), where k -- is token length), as fast as DFA-based solution.
 
<langsyntaxhighlight lang="haskell">{-#Language LambdaCase #-}
import Conduit
 
Just ch | notEsc && ch == esc -> go False b
| notEsc && ch == sep -> yield b >> go True []
| otherwise -> go True (ch:b)</syntaxhighlight>
 
This new conduit could be used in a pipeline as follows:
 
<langsyntaxhighlight lang="haskell">main = runConduit $
yieldMany "one^|uno||three^^^^|four^^^|^cuatro|"
.| splitEscC '|' '^'
.| mapM_C print</syntaxhighlight>
 
<pre>λ> main
"one|uno"
""
"three^^"
"four^|cuatro"
""
</pre>

===Alternative===
This is essentially equivalent to the first (DFA) example, but, though possibly less elegant than the guard idiom, appears to be fractionally faster with larger (eg 180k) test strings.
<langsyntaxhighlight lang="haskell">import Data.Bool (bool)
 
------------------ TOKENIZE WITH ESCAPING ----------------
 
tokenize :: Char -> Char -> String -> [String]
tokenize delim esc str = reverse $ reverse <$> (token : list)
reverse $
reverse <$> (token : list)
where
(token, list, _) =
foldr
( \x (aToken, aList, aEsc) ->
let literal = not aEsc
isEsc = literal && (x == esc)
in bool
( bool (x : aToken) aToken isEsc, aList, isEsc)
([], aToken : aList, isEsc)
(literal && x == delim))isEsc
)
([], aToken : aList, isEsc)
(literal && x == delim)
)
([], [], False)
(reverse str)
 
--------------------------- TEST -------------------------
main :: IO ()
main =
main = mapM_ print $ tokenize '|' '^' "one^|uno||three^^^^|four^^^|^cuatro|"</lang>
mapM_ print $
 
tokenize
'|'
'^'
"one^|uno||three^^^^|four^^^|^cuatro|"</syntaxhighlight>
{{Out}}
<pre>"one|uno"
""
"three^^"
"four^|cuatro"
""</pre>

=={{header|J}}==
 
From the python example:
<syntaxhighlight lang="j">
tokenize1=: tokenize =: '^|'&$: :(4 : 0)
'ESC SEP' =. x
RESULT =. RESULT , < TOKEN
)
</syntaxhighlight>
<pre>
tokenize 'one^|uno||three^^^^|four^^^|^cuatro|'
┌───────┬┬───────┬────────────┬┐
│one|uno││three^^│four^|cuatro││
└───────┴┴───────┴────────────┴┘
</pre>
Here's a somewhat more efficient approach (over 100 times faster on a 100k textual example):
 
<langsyntaxhighlight Jlang="j">tokenize2=: tokenize=:3 :0
'^|' tokenize2 y NB. task default escape and separator
:
Line 1,169 ⟶ 1,586:
T=. (#y){. 1,}.S NB. token beginnings
(T<;.1 K)#&.>T<;.1 y
)</syntaxhighlight>
 
Example use:
 
<langsyntaxhighlight Jlang="j"> '^|' tokenize 'one^|uno||three^^^^|four^^^|^cuatro|'
┌───────┬┬───────┬────────────┬┐
│one|uno││three^^│four^|cuatro││
└───────┴┴───────┴────────────┴┘</syntaxhighlight>
 
 
Solution invoking the sequential machine primitive verb.[[http://jsoftware.com/pipermail/programming/2014-December/040658.html|See this thread.]]<syntaxhighlight lang="j">charTokens =: (0;(3 2 2$(2 1 1 1 2 2 1 2 1 0 1 0));<<'^')&;: NB. sequential machine
splitTokens =: ((<,'|')&= <;._1 ])@:((<,'|'),])
removeExtra =: (}.^:(1<#)) L:0
tokenize3=: tokenize=: ; each @: (removeExtra @: splitTokens @: charTokens)</syntaxhighlight>Example use:<syntaxhighlight lang="j"> t=: 'one^|uno||three^^^^|four^^^|^cuatro|'
 
tokenize t
Line 1,190 ⟶ 1,607:
 
$tokenize t
5</syntaxhighlight>
 
Relative efficiencies:
 
<langsyntaxhighlight Jlang="j"> txt=: 1e5$'one^|uno||three^^^^|four^^^|^cuatro|'
(%"1 <./) timespacex every 'tokenize1 txt';'tokenize2 txt';'tokenize3 txt'
132.856 1
1 7.73534
8.29568 19.9766</syntaxhighlight>
 
So tokenize2 is the fastest, while tokenize1 uses the least amount of memory. Also, tokenize1 is the slowest and tokenize3 uses the most memory. (First column is relative time used, second is relative space used, rows correspond to implementations.)

=={{header|Java}}==
{{trans|Go}}
{{works with|Java|7}}
<langsyntaxhighlight lang="java">import java.util.*;
 
public class TokenizeStringWithEscaping {
return tokens;
}
}</syntaxhighlight>
 
<pre>[one|uno, , three^^, four^|cuatro, ]</pre>

=={{header|JavaScript}}==
===ES5===
====Iterative====
<langsyntaxhighlight JavaScriptlang="javascript">function tokenize(s, esc, sep) {
for (var a=[], t='', i=0, e=s.length; i<e; i+=1) {
var c = s.charAt(i)
var s = 'one^|uno||three^^^^|four^^^|^cuatro|'
document.write(s, '<br>')
for (var a=tokenize(s,'^','|'), i=0; i<a.length; i+=1) document.write(i, ': ', a[i], '<br>')</syntaxhighlight>
{{out}}
<pre>one^|uno||three^^^^|four^^^|^cuatro|
0: one|uno
1: 
2: three^^
3: four^|cuatro
4: </pre>
 
====Functional====
<langsyntaxhighlight JavaScriptlang="javascript">(function () {
'use strict';
 
.join('\n');
 
})();</syntaxhighlight>
{{Out}}
<pre>one|uno

three^^
four^|cuatro
</pre>
 
===ES6===
 
====Hand-parsed====
 
{{Trans|Haskell}} (Single fold version)
<langsyntaxhighlight JavaScriptlang="javascript">((() => {
 
// tokenize :: String -> Character -> Character -> [String]
.map(show)
.join('\n');
}))();</syntaxhighlight>
 
{{Out}}
Line 1,370 ⟶ 1,790:
"four^|cuatro"
""</pre>
 
====Parser combinators====
Defining the function as a composition of generics from a parser combinator library:
 
<syntaxhighlight lang="javascript">(() => {
'use strict';
 
// ------ TOKENIZATION OF A STRING WITH ESCAPES ------
 
// tokenizedWithEscapes :: Char -> Char ->
// String -> [String]
const tokenizedWithEscapes = esc =>
// A list of tokens in a given string,
// where the separator character is sep
// and any character may be escaped by
// a preceding esc character.
sep => compose(
concatMap(fst),
parse(
sepBy(
takeWhileEscP(esc)(
constant(true)
)(
ne(sep)
)
)(char(sep))
)
);
 
// ---------------------- TEST -----------------------
// main :: IO ()
const main = () =>
JSON.stringify(
tokenizedWithEscapes('^')('|')(
'one^|uno||three^^^^|four^^^|^cuatro|'
),
null, 2
);
// -->
// [
// "one|uno",
// "",
// "three^^",
// "four^|cuatro",
// ""
// ]
 
// ----------- GENERIC PARSER COMBINATORS ------------
 
// Parser :: String -> [(a, String)] -> Parser a
const Parser = f =>
// A function lifted into a Parser object.
({
type: 'Parser',
parser: f
});
 
 
// altP (<|>) :: Parser a -> Parser a -> Parser a
const altP = p =>
// p, or q if p doesn't match.
q => Parser(s => {
const xs = parse(p)(s);
return 0 < xs.length ? (
xs
) : parse(q)(s);
});
 
 
// anyChar :: () -> Parser Char
const anyChar = () =>
// A single character.
Parser(
s => 0 < s.length ? [
Tuple(s[0])(
s.slice(1)
)
] : []
);
 
 
// apP <*> :: Parser (a -> b) -> Parser a -> Parser b
const apP = pf =>
// A new parser obtained by the application
// of a Parser-wrapped function,
// to a Parser-wrapped value.
p => Parser(
s => parse(pf)(s).flatMap(
vr => parse(
fmapP(vr[0])(p)
)(vr[1])
)
);
 
 
// bindP (>>=) :: Parser a ->
// (a -> Parser b) -> Parser b
const bindP = p =>
// A new parser obtained by the application of
// a function to a Parser-wrapped value.
// The function must enrich its output, lifting it
// into a new Parser.
// Allows for the nesting of parsers.
f => Parser(
s => parse(p)(s).flatMap(
tpl => parse(f(tpl[0]))(tpl[1])
)
);
 
 
// char :: Char -> Parser Char
const char = x =>
// A particular single character.
satisfy(c => x == c);
 
 
// fmapP :: (a -> b) -> Parser a -> Parser b
const fmapP = f =>
// A new parser derived by the structure-preserving
// application of f to the value in p.
p => Parser(
s => parse(p)(s).flatMap(
first(f)
)
);
 
 
// liftA2P :: (a -> b -> c) ->
// Parser a -> Parser b -> Parser c
const liftA2P = op =>
// The binary function op, lifted
// to a function over two parsers.
p => apP(fmapP(op)(p));
 
 
// many :: Parser a -> Parser [a]
const many = p => {
// Zero or more instances of p.
// Lifts a parser for a simple type of value
// to a parser for a list of such values.
const some_p = p =>
liftA2P(
x => xs => [x].concat(xs)
)(p)(many(p));
return Parser(
s => parse(
0 < s.length ? (
altP(some_p(p))(pureP(''))
) : pureP('')
)(s)
);
};
 
 
// parse :: Parser a -> String -> [(a, String)]
const parse = p =>
// The result of parsing a string with p.
p.parser;
 
 
// pureP :: a -> Parser a
const pureP = x =>
// The value x lifted, unchanged,
// into the Parser monad.
Parser(s => [Tuple(x)(s)]);
 
 
// satisfy :: (Char -> Bool) -> Parser Char
const satisfy = test =>
// Any character for which the
// given predicate returns true.
Parser(
s => 0 < s.length ? (
test(s[0]) ? [
Tuple(s[0])(s.slice(1))
] : []
) : []
);
 
 
// sepBy :: Parser a -> Parser b -> Parser [a]
const sepBy = p =>
// Zero or more occurrences of p, as
// separated by (discarded) instances of sep.
sep => altP(
sepBy1(p)(sep)
)(
pureP([])
);
 
 
// sepBy1 :: Parser a -> Parser b -> Parser [a]
const sepBy1 = p =>
// One or more occurrences of p, as
// separated by (discarded) instances of sep.
sep => bindP(
p
)(x => bindP(
many(
thenP(sep)(
bindP(p)(pureP)
)
)
)(xs => pureP([x].concat(xs))));
 
 
// takeWhileEscP :: Char -> (Char -> Bool) ->
// (Char -> Bool) -> Parser Text
const takeWhileEscP = esc =>
escTest => test => {
// Longest prefix, including any escaped
// characters, in which escTest returns
// true for all escaped characters, and
// test returns true for all other chars.
const plain = takeWhileP(
c => (esc !== c) && test(c)
);
const escaped = thenBindP(
char(esc)
)(
anyChar()
)(x => bindP(
plain
)(
compose(pureP, cons(x))
));
return bindP(
plain
)(x => bindP(
many(escaped)
)(xs => pureP(concat([x].concat(xs)))));
};
 
 
// takeWhileP :: (Char -> Bool) -> Parser String
const takeWhileP = p =>
// The largest prefix in which p is
// true over all the characters.
Parser(
compose(
pureList,
first(concat),
span(p)
)
);
 
 
// thenBindP :: Parser a -> Parser b ->
// (b -> Parser c) Parser c
const thenBindP = o =>
// A combination of thenP and bindP in which a
// preliminary parser consumes text and discards
// its output, before any output of a subsequent
// parser is bound.
p => f => Parser(
s => parse(o)(s).flatMap(
vr => parse(p)(vr[1]).flatMap(
tpl => parse(f(tpl[0]))(tpl[1])
)
)
);
 
 
// thenP (>>) :: Parser a -> Parser b -> Parser b
const thenP = o =>
// A composite parser in which o just consumes text
// and then p consumes more and returns a value.
p => Parser(
s => parse(o)(s).flatMap(
vr => parse(p)(vr[1])
)
);
 
 
// --------------------- GENERIC ---------------------
 
// Tuple (,) :: a -> b -> (a, b)
const Tuple = a =>
b => ({
type: 'Tuple',
'0': a,
'1': b,
length: 2
});
 
 
// compose (<<<) :: (b -> c) -> (a -> b) -> a -> c
const compose = (...fs) =>
// A function defined by the right-to-left
// composition of all the functions in fs.
fs.reduce(
(f, g) => x => f(g(x)),
x => x
);
 
 
// concat :: [[a]] -> [a]
// concat :: [String] -> String
const concat = xs => (
ys => 0 < ys.length ? (
ys.every(Array.isArray) ? (
[]
) : ''
).concat(...ys) : ys
)(list(xs));
 
 
// concatMap :: (a -> [b]) -> [a] -> [b]
const concatMap = f =>
// List monad bind operator.
xs => xs.flatMap(f);
 
 
// cons :: a -> [a] -> [a]
const cons = x =>
// A list constructed from the item x,
// followed by the existing list xs.
xs => Array.isArray(xs) ? (
[x].concat(xs)
) : 'GeneratorFunction' !== xs
.constructor.constructor.name ? (
x + xs
) : ( // cons(x)(Generator)
function* () {
yield x;
let nxt = xs.next();
while (!nxt.done) {
yield nxt.value;
nxt = xs.next();
}
}
)();
 
 
// constant :: a -> b -> a
const constant = k =>
_ => k;
 
 
// first :: (a -> b) -> ((a, c) -> (b, c))
const first = f =>
// A simple function lifted to one which applies
// to a tuple, transforming only its first item.
xy => Tuple(f(xy[0]))(
xy[1]
);
 
 
// fst :: (a, b) -> a
const fst = tpl =>
// First member of a pair.
tpl[0];
 
 
// list :: StringOrArrayLike b => b -> [a]
const list = xs =>
// xs itself, if it is an Array,
// or an Array derived from xs.
Array.isArray(xs) ? (
xs
) : Array.from(xs || []);
 
 
// map :: (a -> b) -> [a] -> [b]
const map = f =>
// The list obtained by applying f
// to each element of xs.
// (The image of xs under f).
xs => [...xs].map(f);
 
 
// ne :: a -> a -> Bool
const ne = a =>
b => a !== b;
 
 
// pureList :: a -> [a]
const pureList = x => [x];
 
 
// span p xs is equivalent to (takeWhile p xs, dropWhile p xs)
// span :: (a -> Bool) -> [a] -> ([a], [a])
const span = p =>
// Longest prefix of xs consisting of elements which
// all satisfy p, tupled with the remainder of xs.
xs => {
const
ys = 'string' !== typeof xs ? (
list(xs)
) : xs,
iLast = ys.length - 1;
return splitAt(
until(
i => iLast < i || !p(ys[i])
)(i => 1 + i)(0)
)(ys);
};
 
 
// splitAt :: Int -> [a] -> ([a], [a])
const splitAt = n =>
xs => Tuple(xs.slice(0, n))(
xs.slice(n)
);
 
 
// unlines :: [String] -> String
const unlines = xs =>
// A single string formed by the intercalation
// of a list of strings with the newline character.
xs.join('\n');
 
 
// until :: (a -> Bool) -> (a -> a) -> a -> a
const until = p =>
f => x => {
let v = x;
while (!p(v)) v = f(v);
return v;
};
 
// MAIN ---
return main();
})();</syntaxhighlight>
{{Out}}
<pre>[
"one|uno",
"",
"three^^",
"four^|cuatro",
"",
""
]</pre>
 
=={{header|jq}}==
{{works with| jq|1.5}}
<langsyntaxhighlight lang="jq"># Tokenize the input using the string "escape" as the prefix escape string
def tokenize(separator; escape):
 
| map( if type == "string" then split(escape) else . end)
| flatten
| reform ;</syntaxhighlight>
 
'''Example:'''
<langsyntaxhighlight lang="jq">"one^|uno||three^^^^|four^^^|^cuatro|" | tokenize("|"; "^")</langsyntaxhighlight>
 
{{out}}
<langsyntaxhighlight lang="sh">$ jq -n -f tokenize.jq
[
"one|uno",
Line 1,421 ⟶ 2,274:
"four^|cuatro",
""
]</syntaxhighlight>
 
=={{header|Julia}}==
Line 1,427 ⟶ 2,280:
{{trans|Kotlin}}
 
<langsyntaxhighlight lang="julia">function tokenize2(s::AbstractString, sep::Char, esc::Char)
SPE = "\ufffe"
SPF = "\uffff"
end
 
@show tokenize2("one^|uno||three^^^^|four^^^|^cuatro|", '|', '^')</syntaxhighlight>
 
{{out}}
Line 1,445 ⟶ 2,298:
 
=={{header|Kotlin}}==
<langsyntaxhighlight lang="scala">// version 1.1.3
 
const val SPE = "\ufffe" // unused unicode char in Specials block
val items = tokenize(str, sep, esc)
for (item in items) println(if (item.isEmpty()) "(empty)" else item)
}</syntaxhighlight>
 
{{out}}
Line 1,477 ⟶ 2,330:
 
=={{header|Lingo}}==
<langsyntaxhighlight lang="lingo">-- in some movie script
 
on tokenize (str, sep, esc)
end repeat
return str
end</syntaxhighlight>
 
<langsyntaxhighlight lang="lingo">str = "one^|uno||three^^^^|four^^^|^cuatro|"
sep = "|"
esc = "^"
put tokenize(str, sep, esc)
-- ["one|uno", "", "three^^", "four^|cuatro", ""]</langsyntaxhighlight>
 
=={{header|Lua}}==
<langsyntaxhighlight Lualang="lua">function tokenise (str, sep, esc)
local strList, word, escaped, ch = {}, "", false
for pos = 1, #str do
for k, v in pairs(tokenise(testStr, testSep, testEsc)) do
print(k, v)
end</syntaxhighlight>
{{out}}
<pre>1 one|uno
Line 1,567 ⟶ 2,420:
4 four^|cuatro
5</pre>
 
=={{header|Mathematica}} / {{header|Wolfram Language}}==
<syntaxhighlight lang="mathematica">ClearAll[Tokenize]
Tokenize[str_String, escape_String : "^", sep_String : "|"] :=
Module[{results = {}, token = "", state = 0, a},
a = Characters[str];
Do[
If[state == 0,
Switch[c,
escape,
state = 1
,
sep,
AppendTo[results, token];
token = "";
,
_,
token = token <> c;
]
,
If[state == 1,
token = token <> c;
state = 0;
]
]
,
{c, a}
];
AppendTo[results, token];
results
]
Tokenize["one^|uno||three^^^^|four^^^|^cuatro|"]</syntaxhighlight>
{{out}}
<pre>{"one|uno", "", "three^^", "four^|cuatro", ""}</pre>
 
=={{header|Nim}}==
<langsyntaxhighlight lang="nim">import streams
 
proc tokenzietokenize(s: Stream, sep: static[char] = '|', esc: static[char] = '^'): seq[string] =
var buff = ""
while not s.atEnd():
let c = readChar s.readChar
case c
of sep:
Line 1,582 ⟶ 2,470:
buff.add s.readChar
else:
buff &=.add c
result.add buff
 
for i, s in tokenzietokenize(newStringStream "one^|uno||three^^^^|four^^^|^cuatro|"):
echo i, ":", s
</syntaxhighlight>
</lang>
{{out}}
<pre>0:one|uno
Line 1,598 ⟶ 2,486:
=={{header|OCaml}}==
 
<langsyntaxhighlight lang="ocaml">let split_with_escaping ~esc ~sep s =
let len = String.length s in
let buf = Buffer.create 16 in
end
in
loop 0</syntaxhighlight>
 
Example:
<langsyntaxhighlight lang="ocaml">let res = split_with_escaping ~esc:'^' ~sep:'|' "one^|uno||three^^^^|four^^^|^cuatro|";;
val res : string list = ["one|uno"; ""; "three^^"; "four^|cuatro"; ""]</syntaxhighlight>
 
 
=={{header|Perl}}==
Line 1,627 ⟶ 2,514:
The built-in <code>split</code> function can be used with a regex that matches the delimiter ''(although [http://perldoc.perl.org/perlre.html#Special-Backtracking-Control-Verbs advanced backtracking control verbs] are needed to skip escaped delimiters)'':
 
<langsyntaxhighlight lang="perl">sub tokenize {
my ($string, $sep, $esc) = (shift, quotemeta shift, quotemeta shift);
my @fields = split /$esc . (*SKIP)(*FAIL) | $sep/sx, $string, -1;
return map { s/$esc(.)/$1/gsr } @fields;
}</syntaxhighlight>
 
A more traditional approach is to parse the input string step by step ''(using a repeatedly-matching regex of the form [http://perldoc.perl.org/perlretut.html#Global-matching <code>/\G.../g</code>])'', and throw away the separators ''(which can be done implicitly using [http://perldoc.perl.org/perlre.html#%28?%3C=pattern%29-\K \K])'':
 
<langsyntaxhighlight lang="perl"> my @fields = $string =~ /\G (?:^ | $sep) \K (?: [^$sep$esc] | $esc .)*/gsx;</langsyntaxhighlight>
 
In both cases, stripping the escape characters happens as a separate step.
Testing:
 
<langsyntaxhighlight lang="perl">print "'$_'\n" for tokenize("one^|uno||three^^^^|four^^^|^cuatro|", '|', '^');</langsyntaxhighlight>
 
{{out}}
<pre>
'one|uno'
''
'three^^'
'four^|cuatro'
''
</pre>
 
=={{header|Phix}}==
<!--<syntaxhighlight lang="phix">(phixonline)-->
<lang Phix>function tokenize(string s, integer sep, integer esc)
<span style="color: #008080;">function</span> <span style="color: #000000;">tokenize</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">integer</span> <span style="color: #000000;">sep</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">integer</span> <span style="color: #000000;">esc</span><span style="color: #0000FF;">)</span>
sequence ret = {}
<span style="color: #004080;">sequence</span> <span style="color: #000000;">ret</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span>
string this = ""
<span style="color: #004080;">string</span> <span style="color: #000000;">word</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">""</span>
integer skip = 0
<span style="color: #004080;">integer</span> <span style="color: #000000;">skip</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span>
 
if length(s)!=0 then
<span style="color: #008080;">if</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">s</span><span style="color: #0000FF;">)!=</span><span style="color: #000000;">0</span> <span style="color: #008080;">then</span>
for i=1 to length(s) do
<span style="color: #008080;">for</span> <span style="color: #000000;">i</span><span style="color: #0000FF;">=</span><span style="color: #000000;">1</span> <span style="color: #008080;">to</span> <span style="color: #7060A8;">length</span><span style="color: #0000FF;">(</span><span style="color: #000000;">s</span><span style="color: #0000FF;">)</span> <span style="color: #008080;">do</span>
integer si = s[i]
<span style="color: #004080;">integer</span> <span style="color: #000000;">si</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">[</span><span style="color: #000000;">i</span><span style="color: #0000FF;">]</span>
if skip then
<span style="color: #008080;">if</span> <span style="color: #000000;">skip</span> <span style="color: #008080;">then</span>
this &= si
<span style="color: #000000;">word</span> <span style="color: #0000FF;">&=</span> <span style="color: #000000;">si</span>
skip = 0
<span style="color: #000000;">skip</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">0</span>
elsif si=esc then
<span style="color: #008080;">elsif</span> <span style="color: #000000;">si</span><span style="color: #0000FF;">=</span><span style="color: #000000;">esc</span> <span style="color: #008080;">then</span>
skip = 1
<span style="color: #000000;">skip</span> <span style="color: #0000FF;">=</span> <span style="color: #000000;">1</span>
elsif si=sep then
<span style="color: #008080;">elsif</span> <span style="color: #000000;">si</span><span style="color: #0000FF;">=</span><span style="color: #000000;">sep</span> <span style="color: #008080;">then</span>
ret = append(ret,this)
<span style="color: #000000;">ret</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">ret</span><span style="color: #0000FF;">,</span><span style="color: #000000;">word</span><span style="color: #0000FF;">)</span>
this = ""
<span style="color: #000000;">word</span> <span style="color: #0000FF;">=</span> <span style="color: #008000;">""</span>
else
<span this &style="color: si#008080;">else</span>
<span style="color: #000000;">word</span> <span style="color: #0000FF;">&=</span> <span style="color: #000000;">si</span>
end if
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
end for
<span style="color: #008080;">end</span> <span style="color: #008080;">for</span>
ret = append(ret,this)
<span style="color: #000000;">ret</span> <span style="color: #0000FF;">=</span> <span style="color: #7060A8;">append</span><span style="color: #0000FF;">(</span><span style="color: #000000;">ret</span><span style="color: #0000FF;">,</span><span style="color: #000000;">word</span><span style="color: #0000FF;">)</span>
end if
<span style="color: #008080;">end</span> <span style="color: #008080;">if</span>
return ret
<span style="color: #008080;">return</span> <span style="color: #000000;">ret</span>
end function
<span style="color: #008080;">end</span> <span style="color: #008080;">function</span>
 
?tokenize("one^|uno||three^^^^|four^^^|^cuatro|",'|','^')</lang>
<span style="color: #0000FF;">?</span><span style="color: #000000;">tokenize</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"one^|uno||three^^^^|four^^^|^cuatro|"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'|'</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'^'</span><span style="color: #0000FF;">)</span>
<!--</syntaxhighlight>-->
{{Out}}
<pre>
{"one|uno","","three^^","four^|cuatro",""}
</pre>
 
=={{header|PicoLisp}}==
<langsyntaxhighlight PicoLisplang="picolisp">(de tokenize (Str Sep Esc)
(split
(make
((= C Sep) (link 0))
(T (link C)) ) ) ) )
0 ) )</syntaxhighlight>
Test:
<langsyntaxhighlight PicoLisplang="picolisp">(for (I . S) (tokenize "one\^|uno||three\^\^\^\^|four\^\^\^|\^cuatro|" "|" "\^")
(prinl I ": " S) )</langsyntaxhighlight>
Output:
<pre>1: one|uno
2: 
3: three^^
4: four^|cuatro
5: </pre>
 
=={{header|PowerShell}}==
<syntaxhighlight lang="powershell">
function Split-String ([string]$String, [char]$Separator, [char]$Escape)
{
if ($String[-1] -eq $Separator) {[String]::Empty}
}
</syntaxhighlight>
<syntaxhighlight lang="powershell">
Split-String "one^|uno||three^^^^|four^^^|^cuatro|" -Separator "|" -Escape "^" | ForEach-Object `
-Begin {$n = 0} `
-Process {$n+= 1; "{0}: {1}" -f $n, $_}
</syntaxhighlight>
{{Out}}
<pre>
1: one|uno
2: 
3: three^^
4: four^|cuatro
5: 
</pre>
=={{header|Python}}==
===Procedural===
<langsyntaxhighlight lang="python">def token_with_escape(a, escape = '^', separator = '|'):
'''
Issue python -m doctest thisfile.py to run the doctests.
Line 1,789 ⟶ 2,653:
token = ''
state = 0
for c in a:
if state == 0:
if c == escape:
Line 1,802 ⟶ 2,666:
state = 0
result.append(token)
return result</langsyntaxhighlight>
 
===Functional===
{{Works with|Python|3}}
<langsyntaxhighlight lang="python">'''Tokenize a string with escaping'''
 
from functools import reduce
# MAIN ---
if __name__ == '__main__':
    main()</syntaxhighlight>
{{Out}}
<pre>['one|uno', '', 'three^^', 'four^|cuatro', '']</pre>
 
===Regex-based===
 
====Using <code>Scanner</code>====
 
The python <code>re</code> library has a handy class <code>Scanner</code> which is intended precisely for this use-case.
It takes a list of pairs '''regex, action''' and whenever it encounters '''regex''' in the input, it executes '''action'''.
This allows us to solve this task very efficiently with minimum effort, the hardest part being the correct definition of the regular expressions.
 
The following code also illustrates an important feature of Python ‒ nested functions with closures.
Owing to this feature, the inner functions, such as <code>start_new_token</code>, are able to access the local variable <code>tokens</code> of their enclosing function <code>tokenize</code>.
For the inner function, the name <code>tokens</code> is ''nonlocal'', and is in the ''enclosing scope'' of the inner function (as opposed to the parameters <code>scanner</code> and <code>substring</code>, which are in the local scope).
 
<syntaxhighlight lang="python">import re
 
STRING = 'one^|uno||three^^^^|four^^^|^cuatro|'
 
def tokenize(string=STRING, escape='^', separator='|'):
 
escape, separator = map(re.escape, (escape, separator))
 
tokens = ['']
 
def start_new_token(scanner, substring):
tokens.append('')
 
def add_escaped_char(scanner, substring):
char = substring[1]
tokens[-1] += char
 
def add_substring(scanner, substring):
tokens[-1] += substring
 
re.Scanner([
# an escape followed by a character produces that character
(fr'{escape}.', add_escaped_char),
 
# when encountering a separator not preceded by an escape,
# start a new token
(fr'{separator}', start_new_token),
 
# a sequence of regular characters (i.e. not escape or separator)
# is just appended to the token
(fr'[^{escape}{separator}]+', add_substring),
]).scan(string)
 
return tokens
 
 
if __name__ == '__main__':
print(list(tokenize()))</syntaxhighlight>
 
Output is the same as in the functional Python version above.
 
 
====Simpler version with preprocessing====
 
This version does not require any extra state, such as the <code>token</code> list in the Scanner-based version above.
It first preprocesses the input, since Python does not support variable-length lookbehind assertions.
Then it works only with the primitive regex operations <code>re.findall</code> and <code>re.sub</code>.
Note that the regex used here is compiled with the <code>re.VERBOSE</code> flag.
This allows us to write the regex on several lines (since unescaped whitespace is ignored in this mode), and use comments inside the regex (starting with <code>#</code>).
 
<syntaxhighlight lang="python">import re
 
STRING = 'one^|uno||three^^^^|four^^^|^cuatro|'
 
def tokenize(string=STRING, escape='^', separator='|'):
 
re_escape, re_separator = map(re.escape, (escape, separator))
 
# token regex
regex = re.compile(fr'''
# lookbehind: a token must be preceded by a separator
# (note that `(?<=^|{re_separator})` doesn't work in Python)
(?<={re_separator})
 
# a token consists either of an escape sequence,
# or a regular (non-escape, non-separator) character,
# repeated arbitrarily many times (even zero)
(?:{re_escape}.|[^{re_escape}{re_separator}])*
''',
flags=re.VERBOSE
)
 
# since each token must start with a separator,
# we must add an extra separator at the beginning of input
preprocessed_string = separator + string
 
for almost_token in regex.findall(preprocessed_string):
# now get rid of escape characters: '^^' -> '^' etc.
token = re.sub(fr'{re_escape}(.)', r'\1', almost_token)
yield token
 
if __name__ == '__main__':
print(list(tokenize()))</syntaxhighlight>
 
=={{header|Racket}}==
<langsyntaxhighlight lang="racket">#lang racket/base
(require racket/match)
 
(report-input-output "|")
(report-input-output "^")
(report-input-output ".")</langsyntaxhighlight>
 
{{out}}
Line 1,893 ⟶ 2,853:
Input: "."
Output: (".")</pre>
 
=={{header|Raku}}==
(formerly Perl 6)
 
<syntaxhighlight lang="raku" line>sub tokenize ($string, :$sep!, :$esc!) {
return $string.match(/([ <!before $sep | $esc> . | $esc . ]*)+ % $sep/)\
.[0].map(*.subst: /$esc )> ./, '', :g);
}
 
say "'$_'" for tokenize 'one^|uno||three^^^^|four^^^|^cuatro|', sep => '|', esc => '^';</syntaxhighlight>
 
{{out}}
<pre>
'one|uno'
''
'three^^'
'four^|cuatro'
''
</pre>
 
Notable Raku innovations that make this different from the equivalent [[#Perl]] solution:
 
* string variables can be safely interpolated into regexes without having to 'quotemeta' them
* regexes matches return a nested <code>Match</code> object which allows retrieving ''all'' results for a given capture group ''(rather than just the last thing that it matched)'', thus getting rid of the need for repeated global matching
* the <code>&lt;field&gt;+ % &lt;delimiter&gt;</code> regex construct allows handling the delimiters in a more idiomatic way
* the <code>)&gt;</code> regex construct can be used to exclude anything that follows it from the returned match result
 
=={{header|REXX}}==
===IF/THEN logic===
<langsyntaxhighlight lang="rexx">/*REXX program demonstrates tokenizing and displaying a string with escaping sequences. */
str = 'one^|uno||three^^^^|four^^^|^cuatro|' /*the character string to be tokenized.*/
esc = '^' /* " escape character to be used. */
exit /*stick a fork in it, we're all done. */
/*──────────────────────────────────────────────────────────────────────────────────────*/
show: say '[length'right(length(out),4)"]" out; out=; return</langsyntaxhighlight>
'''output'''
<pre>
Line 1,925 ⟶ 2,911:
===SELECT logic===
This REXX version also shows a scale in the output.
<langsyntaxhighlight lang="rexx">/*REXX program demonstrates tokenizing and displaying a string with escaping sequences. */
str = 'one^|uno||three^^^^|four^^^|^cuatro|' /*the character string to be tokenized.*/
esc = '^' /* " escape character to be used. */
exit /*stick a fork in it, we're all done. */
/*──────────────────────────────────────────────────────────────────────────────────────*/
show: say '[length'right(length($),4)"]" $; $=; return</langsyntaxhighlight>
'''output'''
<pre>
Line 1,965 ⟶ 2,951:
 
=={{header|Ring}}==
<langsyntaxhighlight lang="ring">
tokenize("one^|uno||three^^^^|four^^^|^cuatro|", "|", "^")
 
next
see nl
</syntaxhighlight>
Output:
<pre>
Line 2,007 ⟶ 2,993:
{{trans|Perl}}
 
<langsyntaxhighlight lang="ruby">
def tokenize(string, sep, esc)
sep = Regexp.escape(sep)
p tokenize('one^|uno||three^^^^|four^^^|^cuatro|', '|', '^')
 
</syntaxhighlight>
 
=={{header|Rust}}==
<langsyntaxhighlight lang="rust">const SEPARATOR: char = '|';
const ESCAPE: char = '^';
const STRING: &str = "one^|uno||three^^^^|four^^^|^cuatro|";
fn main() {
println!("{:#?}", tokenize(STRING));
}</syntaxhighlight>
{{out}}
<pre>
[
    "one|uno",
    "",
    "three^^",
    "four^|cuatro",
    "",
]
</pre>

=={{header|Scala}}==
===Old fashioned Imperative===
Imperative with removed (ugly) mutable variables.
{{Trans|Kotlin}}<langsyntaxhighlight Scalalang="scala">object TokenizeStringWithEscaping0 extends App {
 
val (markerSpE,markerSpF) = ("\ufffe" , "\uffff")
 
tokenize(str, "|", "^").foreach(it => println(if (it.isEmpty) "<empty token>" else it))
}</syntaxhighlight>
 
===Idiomatic===
====Functional with Tail recursion====
<langsyntaxhighlight Scalalang="scala">import scala.annotation.tailrec
 
object TokenizeStringWithEscaping1 extends App {
println(
f"[length:${it.length}%3d] ${if (it.isEmpty) "<empty token>" else it}"))
}</syntaxhighlight>
 
{{Out}}See it in running in your browser by [https://scalafiddle.io/sf/EsIjPQg/0 ScalaFiddle (JavaScript)] or by [https://scastie.scala-lang.org/O3DgMmuOSCS5DD6zQXK7MA Scastie (JVM)].
 
=={{header|Sidef}}==
{{trans|Perl}}
<langsyntaxhighlight lang="ruby">func tokenize(string, sep, esc) {
var fields = string.split(
Regex(esc.escape + '.(*SKIP)(*FAIL)|' + sep.escape, 's'), -1
)
fields.map{.gsub(Regex(esc.escape + '(.)'), {|s1| s1 }) }
}
 
tokenize("one^|uno||three^^^^|four^^^|^cuatro|", '|', '^').each { |str|
say str.dump
}</syntaxhighlight>
{{out}} NOTE: the output is different from the one given in the task
<pre>
"one^|uno"
""
"three^^^^"
"four^^^|^cuatro"
""
</pre>
 
=={{header|Simula}}==
<syntaxhighlight lang="simula">
SIMSET
BEGIN
 
LINK CLASS ITEM(TXT); TEXT TXT;;
 
REF(HEAD) PROCEDURE SPLIT(TXT, SEP, ESC); TEXT TXT; CHARACTER SEP, ESC;
BEGIN
REF(HEAD) PARTS;
CHARACTER CH;
TEXT PART;
 
PART :- BLANKS(TXT.LENGTH);
PARTS :- NEW HEAD;
TXT.SETPOS(1);
WHILE TXT.MORE DO BEGIN
CH := TXT.GETCHAR;
IF CH = ESC THEN BEGIN
IF TXT.MORE THEN BEGIN
CH := TXT.GETCHAR;
PART.PUTCHAR(CH);
END ELSE BEGIN
ERROR("SPLIT: ESCAPE CHAR AT END OF STRING");
END;
END ELSE IF CH = SEP THEN BEGIN
NEW ITEM(COPY(PART.SUB(1,PART.POS-1))).INTO(PARTS);
PART.SETPOS(1);
END ELSE BEGIN
PART.PUTCHAR(CH);
END;
END;
NEW ITEM(COPY(PART.SUB(1,PART.POS-1))).INTO(PARTS);
 
SPLIT :- PARTS;
END SPLIT;
 
TEXT EXAMPLE;
REF(HEAD) RESULT;
REF(ITEM) PART;
INTEGER NO;
 
FOR EXAMPLE :- "ONE^|UNO||THREE^^^^|FOUR^^^|^CUATRO|" DO
BEGIN
OUTTEXT("INPUT: '");
OUTTEXT(EXAMPLE);
OUTTEXT("'");
OUTIMAGE;
RESULT :- SPLIT(EXAMPLE, '|', '^');
PART :- RESULT.FIRST;
NO := 0;
WHILE PART =/= NONE DO
BEGIN
NO := NO + 1;
OUTTEXT("PART");
OUTINT(NO, 0);
OUTTEXT(": '");
OUTTEXT(PART.TXT);
OUTTEXT("'");
OUTIMAGE;
PART :- PART.SUC;
END;
END;
 
END.
</syntaxhighlight>
{{out}}
<pre>
INPUT: 'ONE^|UNO||THREE^^^^|FOUR^^^|^CUATRO|'
PART1: 'ONE|UNO'
PART2: ''
PART3: 'THREE^^'
PART4: 'FOUR^|CUATRO'
PART5: ''
</pre>
 
=={{header|SNOBOL4}}==
{{works with|SNOBOL4, SPITBOL for Linux}}
<syntaxhighlight lang="snobol4">
* Program: tokenize_with_escape.sbl
* To run: sbl tokenize_with_escape.sbl
* Description: Tokenize a string with escaping
* Comment: Tested using the Spitbol for Linux version of SNOBOL4
 
lf = substr(&alphabet,11,1) ;* New line or line feed
 
 
* Function tokenize will break parts out of a string, which are
* separated by c, which defaults to a comma, into
* an array. Parameter kp=1 to keep null parts, which is the default,
* and 0 to discard.
define('tokenize(s,c,kp)tokenizepat,part,t,i,j')
:(tokenize_end)
tokenize
c = (ident(c) ',', substr(c,1,1)) :f(freturn)
kp = (ident(kp) 1, eq(kp,0) 0, 1) :f(freturn)
t = table()
tokenizepat = breakx(c) . part c | (len(1) rem) . part
s ? eq(kp,1) rtab(1) c = s c
tokenize1
s ? tokenizepat = "" :f(tokenize2)
t[i = eq(kp,0) differ(part) i + 1] = part
t[i = eq(kp,1) i + 1] = part
:(tokenize1)
tokenize2
tokenize = array(i) :f(errr)
j = 0
tokenize3 tokenize[j = lt(j,i) j + 1] = t[j] :s(tokenize3)
:(return)
tokenize_end
 
 
* Function tokcan will a normalize a string by applying separator and escape
* rules to string ts. Parameter sep is the separator, while esc is the escape
* character. Parameter tesc is the new separator character to substitute for
* parameter sep. It defaults to a comma, ",".
define('tokcan(ts,sep,esc,tesc)tpat,part1,part2,notany') :(tokcan_end)
tokcan
tesc = (ident(tesc) ',', substr(tesc,1,1))
tpat = (breakx(sep esc) . part1
+ (sep | esc sep | esc esc | (esc len(1) . notany)) . part2
+ )
+ | (len(1) rem) . part1
 
tokcan1
ts ? tpat = :f(tokcan2)
part2 = (leq(part2,sep) tesc
+ ,leq(part2,esc sep) sep
+ ,leq(part2,esc esc) esc
+ ,differ(notany) leq(part2,esc notany) notany
+ )
tokcan = (ident(tokcan) "", tokcan) part1 part2
:(tokcan1)
tokcan2
:(return)
tokcan_end
 
 
test_string = "one^|uno||three^^^^|four^^^|^cuatro|"
sep = "|"
esc = "^"
 
hline = tokcan(test_string,sep,esc) :f(err)
 
 
output = " Input: " test_string lf
output = "Output1: " hline lf
 
output = "Output2: "
tokenized = tokenize(hline,",")
 
p1 output = "'" tokenized[z = z + 1] "'" :s(p1)
 
END
</syntaxhighlight>
{{out}}
<pre>
Input: one^|uno||three^^^^|four^^^|^cuatro|
 
Output1: one|uno,,three^^,four^|cuatro,
 
Output2:
'one|uno'
''
'three^^'
'four^|cuatro'
''
</pre>
 
=={{header|Swift}}==
 
{{trans|Rust}}
 
<syntaxhighlight lang="swift">extension String {
func tokenize(separator: Character, escape: Character) -> [String] {
var token = ""
var tokens = [String]()
var chars = makeIterator()
 
while let char = chars.next() {
switch char {
case separator:
tokens.append(token)
token = ""
case escape:
if let next = chars.next() {
token.append(next)
}
case _:
token.append(char)
}
}
 
tokens.append(token)
 
return tokens
}
}
 
print("one^|uno||three^^^^|four^^^|^cuatro|".tokenize(separator: "|", escape: "^"))</syntaxhighlight>
 
{{out}}
 
<pre>["one|uno", "", "three^^", "four^|cuatro", ""]</pre>
 
=={{header|Tcl}}==
Putting a coroutine in a TclOO object following the "generator pattern" gives a nice structure:
<langsyntaxhighlight Tcllang="tcl">oo::class create tokens {
constructor {s} {
puts [coroutine Next my Iter $s]
}
 
puts [tokenize one^|uno||three^^^^|four^^^|^cuatro| | ^]</syntaxhighlight>
 
{{out}}
<pre>one|uno {} three^^ four^|cuatro {}</pre>
 
=={{header|TMG}}==
Unix TMG:
<syntaxhighlight lang="unixtmg">prog: char(sep) *
char(esc) *
str: smark
token: forw/outp
( [ch==esc?] char(ch) any(!<<>>) token
| [ch==sep?] char(ch) outp str
| any(!<<>>) token );
outp: parse(( scopy = { <"> 1 <"> * } ));
forw: peek/chkeof;
peek: [ch=0] char(ch) fail;
chkeof: ( [ch?] succ | fail );
 
ch: 0;
sep: 0;
esc: 0;</syntaxhighlight>
 
Input:
<pre>|
^
one^|uno||three^^^^|four^^^|^cuatro|</pre>
 
Output:
<pre>"one|uno"
""
"three^^"
"four^|cuatro"
""</pre>
 
=={{header|VBA}}==
{{trans|Phix}}<langsyntaxhighlight lang="vb">Private Function tokenize(s As String, sep As String, esc As String) As Collection
Dim ret As New Collection
Dim this As String
Next i
Debug.Print Join(outstring, ", ")
End Sub</syntaxhighlight>{{out}}
<pre>one|uno, , three^^, four^|cuatro, </pre>
 
=={{header|V (Vlang)}}==
{{trans|Go}}
<syntaxhighlight lang="ecmascript">fn tokenize_string(s string, sep u8, escape u8) ?[]string {
mut tokens := []string{}
mut runes := []u8{}
mut in_escape := false
for r in s {
if in_escape {
in_escape = false
runes << r
} else if r == escape {
in_escape = true
} else if r == sep {
tokens << runes.bytestr()
runes = runes[..0]
} else {
runes << r
}
}
tokens << runes.bytestr()
if in_escape {
return error("invalid terminal escape")
}
return tokens
}
 
const sample = "one^|uno||three^^^^|four^^^|^cuatro|"
const separator = `|`
const escape = `^`
fn main() {
println("Input: $sample")
tokens := tokenize_string(sample, separator, escape)?
println("Tokens: $tokens")
}</syntaxhighlight>
 
{{out}}
<pre>
Input: one^|uno||three^^^^|four^^^|^cuatro|
Tokens: ['one|uno', '', 'three^^', 'four^|cuatro', '']
</pre>
 
=={{header|Wren}}==
{{trans|Kotlin}}
<syntaxhighlight lang="wren">var SPE = "\ufffe" // unused unicode character in Specials block
var SPF = "\uffff" // ditto
 
var tokenize = Fn.new { |str, sep, esc|
str = str.replace(esc + esc, SPE).replace(esc + sep, SPF)
str = (str[-1] == esc) ? str[0...-1].replace(esc, "") + esc : str.replace(esc, "")
return str.split(sep).map { |s| s.replace(SPE, esc).replace(SPF, sep) }.toList
}
 
var str = "one^|uno||three^^^^|four^^^|^cuatro|"
var sep = "|"
var esc = "^"
var items = tokenize.call(str, sep, esc)
for (item in items) System.print((item == "") ? "(empty)" : item)</syntaxhighlight>
 
{{out}}
<pre>
one|uno
(empty)
three^^
four^|cuatro
(empty)
</pre>
 
=={{header|zkl}}==
Two simplifying assumptions (since their behavior is undefined): A string ending with an un-escaped escape is an error and 0xff is not an allowed character in the string.
<langsyntaxhighlight lang="zkl">fcn tokenize(str,sep,esc){
sink:=Sink(String);
foreach c in (str){
Line 2,231 ⟶ 3,521:
}
sink.close().split("\xff");
}</syntaxhighlight>
Or, if you prefer brevity:
<langsyntaxhighlight lang="zkl">fcn tokenize(str,sep,esc){
sink:=Sink(String);
foreach c in (str){ sink.write( (c==esc and __cWalker.next()) or (c==sep and "\xff") or c ) }
sink.close().split("\xff");
}</syntaxhighlight>
<langsyntaxhighlight lang="zkl">tokenize("one^|uno||three^^^^|four^^^|^cuatro|", "|","^").println();</langsyntaxhighlight>
{{out}}
<pre>L("one|uno","","three^^","four^|cuatro","")</pre>