Tokenize a string with escaping: Difference between revisions

m (→‎Regex-based: ordering of regexes is not important (at least in this case))
m (→‎{{header|Wren}}: Changed to Wren S/H)
 
(10 intermediate revisions by 8 users not shown)
Line 67:
{{trans|Python}}
 
<syntaxhighlight lang="11l">F token_with_escape(a, escape = ‘^’, separator = ‘|’)
[String] result
V token = ‘’
Line 86:
R result
 
print(token_with_escape(‘one^|uno||three^^^^|four^^^|^cuatro|’).map(s -> ‘'’s‘'’).join(‘, ’))</syntaxhighlight>
 
{{out}}
Line 95:
=={{header|8080 Assembly}}==
 
<syntaxhighlight lang="8080asm"> org 100h
jmp demo
;;; Routine to split a 0-terminated string
Line 168:
pfx: db '> $' ; Prefix to make the output more obvious
nl: db 13,10,'$'
test: db 'one^|uno||three^^^^|four^^^|^cuatro|',0</syntaxhighlight>
 
{{out}}
Line 180:
 
=={{header|Action!}}==
<syntaxhighlight lang="action!">DEFINE PTR="CARD"
 
TYPE Tokens=[
Line 258:
Tokenize("one^|uno||three^^^^|four^^^|^cuatro|",'|,'^,t)
PrintTokens(t)
RETURN</syntaxhighlight>
{{out}}
[https://gitlab.com/amarok8bit/action-rosetta-code/-/raw/master/images/Tokenize_a_string_with_escaping.png Screenshot from Atari 8-bit computer]
Line 270:
 
=={{header|Ada}}==
<syntaxhighlight lang="ada">with Ada.Text_Io;
with Ada.Containers.Indefinite_Vectors;
with Ada.Strings.Unbounded;
Line 326:
begin
Put_Vector (Split ("one^|uno||three^^^^|four^^^|^cuatro|"));
end Tokenize;</syntaxhighlight>
{{out}}
<pre>'one|uno'
Line 335:
 
=={{header|ALGOL 68}}==
<syntaxhighlight lang="algol68">BEGIN
# returns s parsed according to delimiter and escape #
PROC parse with escapes = ( STRING s, CHAR delimiter, escape )[]STRING:
Line 372:
[]STRING tokens = parse with escapes( "one^|uno||three^^^^|four^^^|^cuatro|", "|", "^" );
FOR t pos FROM LWB tokens TO UPB tokens DO print( ( "[", tokens[ t pos ], "]", newline ) ) OD
END</syntaxhighlight>
{{out}}
<pre>
Line 385:
 
{{Trans|JavaScript}}
<syntaxhighlight lang="applescript">------------------ TOKENIZE WITH ESCAPING ----------------
 
-- tokenize :: String -> Character -> Character -> [String]
Line 475:
g
end if
end cond</syntaxhighlight>
{{Out}}
<pre>1: one|uno
Line 484:
 
=={{header|Arturo}}==
<syntaxhighlight lang="rebol">tokenize: function [s sep esc][
escaping: 0
 
Line 505:
 
str: "one^|uno||three^^^^|four^^^|^cuatro|"
tokenize str "|" "^"</syntaxhighlight>
 
{{out}}
Line 515:
 
=={{header|AutoHotkey}}==
<syntaxhighlight lang="autohotkey">Tokenize(s,d,e){
for i,v in x:=StrSplit(StrReplace(StrReplace(StrReplace(s,e e,Chr(0xFFFE)),e d,Chr(0xFFFF)),e),d)
x[i]:=StrReplace(StrReplace(v,Chr(0xFFFE),e),Chr(0xFFFF),d)
return x
}</syntaxhighlight>
Examples:<syntaxhighlight lang="autohotkey">str := "one^|uno||three^^^^|four^^^|^cuatro|"
for i, v in Tokenize(str, "|", "^")
output .= i " : " v "`n"
MsgBox % output</syntaxhighlight>
{{out}}
<pre>1 : one|uno
Line 532:
 
=={{header|BBC BASIC}}==
<syntaxhighlight lang="bbcbasic">REM >tokenizer
PROC_tokenize("one^|uno||three^^^^|four^^^|^cuatro|", "|", "^")
END
Line 560:
NEXT
PRINT
ENDPROC</syntaxhighlight>
{{out}}
<pre> 1 one|uno
Line 567:
4 four^|cuatro
5 </pre>
 
=={{header|BQN}}==
 
<syntaxhighlight lang="bqn">str ← "one^|uno||three^^^^|four^^^|^cuatro|"
Split ← ((⊢-˜+`׬)∘=⊔⊢)
SplitE ← {
esc ← <`'^'=𝕩
rem ← »esc
spl ← (¬rem)∧'|'=𝕩
𝕩⊔˜(⊢-(esc∨spl)×1⊸+)+`spl
}
 
•Show SplitE str</syntaxhighlight>
<syntaxhighlight lang="text">⟨ "one|uno" ⟨⟩ "three^^" "four^|cuatro" ⟩</syntaxhighlight>
 
=={{header|C}}==
{{works with|C}}
 
<syntaxhighlight lang="c">#include <stdlib.h>
#include <stdio.h>
 
Line 671 ⟶ 685:
 
return list;
}</syntaxhighlight>
 
{{Out}}
Line 687 ⟶ 701:
 
=={{header|C sharp}}==
<syntaxhighlight lang="csharp">using System;
using System.Text;
using System.Collections.Generic;
Line 727 ⟶ 741:
return result;
}
}</syntaxhighlight>
{{out}}
<pre>
Line 737 ⟶ 751:
 
=={{header|C++}}==
<syntaxhighlight lang="cpp">#include <iostream>
#include <stdexcept>
#include <string>
Line 780 ⟶ 794:
 
return 0;
}</syntaxhighlight>
{{out}}
<pre>one^|uno||three^^^^|four^^^|^cuatro|
Line 786 ⟶ 800:
 
=={{header|CLU}}==
<syntaxhighlight lang="clu">tokenize = iter (sep, esc: char, s: string) yields (string)
escape: bool := false
part: array[char] := array[char]$[]
Line 812 ⟶ 826:
stream$putl(po, "\"" || part || "\"")
end
end start_up</syntaxhighlight>
{{out}}
<pre>"one|uno"
Line 821 ⟶ 835:
 
=={{header|COBOL}}==
<syntaxhighlight lang="cobol"> >>SOURCE FORMAT FREE
identification division.
program-id. 'tokenizewithescaping'.
Line 956 ⟶ 970:
.
end program 'tokenizewithescaping'.
</syntaxhighlight>
 
{{out}}
Line 1,008 ⟶ 1,022:
 
=={{header|Common Lisp}}==
<syntaxhighlight lang="lisp">(defun split (input separator escape)
(flet ((make-string-buffer ()
(make-array 0 :element-type 'character :adjustable t :fill-pointer t)))
Line 1,030 ⟶ 1,044:
(defun main ()
(dolist (token (split "one^|uno||three^^^^|four^^^|^cuatro|" #\| #\^))
(format t "'~A'~%" token)))</syntaxhighlight>
{{out}}
<pre>'one|uno'
Line 1,040 ⟶ 1,054:
=={{header|D}}==
{{trans|Java}}
<syntaxhighlight lang="d">import std.stdio;
 
void main() {
Line 1,074 ⟶ 1,088:
output.put(token.data.idup);
return output.data;
}</syntaxhighlight>
 
{{out}}
Line 1,083 ⟶ 1,097:
{{trans|C#}}
 
<syntaxhighlight lang="dyalect">func String.Tokenize(separator, escape) {
var buffer = []
var escaping = false
for c in this {
if escaping {
buffer.Add(c)
escaping = false
} else if c == escape {
escaping = true
} else if c == separator {
yield buffer.Flush();
} else {
buffer.Add(c);
}
}
 
if buffer.Length() > 0 || this[this.Length() - 1] == separator {
yield buffer.Flush()
}
}
func Array.Flush() {
var str = String.Concat(values: this)
this.Clear()
str
}
 
let testcase = "one^|uno||three^^^^|four^^^|^cuatro|";
for token in testcase.Tokenize(separator: '|', escape: '^') {
print(": \(token)")
}</syntaxhighlight>
 
{{out}}
Line 1,125 ⟶ 1,139:
=={{header|Elena}}==
{{trans|C#}}
ELENA 6.x :
<syntaxhighlight lang="elena">import extensions;
import extensions'routines;
import system'collections;
Line 1,140 ⟶ 1,154:
bool escaping := false;
self.forEach::(ch)
{
if (escaping)
{
buffer.write(ch);
escaping := false
}
Line 1,158 ⟶ 1,172:
else
{
buffer.write(ch)
}
};
Line 1,170 ⟶ 1,184:
public program()
{
testcase.tokenize("|", "^").forEach(printingLn)
}</syntaxhighlight>
{{out}}
<pre>
Line 1,181 ⟶ 1,195:
 
=={{header|F_Sharp|F#}}==
<syntaxhighlight lang="fsharp">open System
open System.Text.RegularExpressions
 
Line 1,208 ⟶ 1,222:
|> Seq.map (unescape esc)
|> Seq.iter (fun s -> printfn "'%s'" s)
0</syntaxhighlight>
{{out}}
<pre>'one|uno'
Line 1,219 ⟶ 1,233:
This example uses Factor's <code>parser-combinators</code> vocabulary, which is modeled after Haskell's parser combinators. Page <tt>51</tt> of [https://bluishcoder.co.nz/factor-articles.pdf this pdf] contains a useful introduction to this vocabulary.
{{works with|Factor|0.99 2019-10-06}}
<syntaxhighlight lang="factor">USING: accessors kernel lists literals namespaces
parser-combinators prettyprint sequences strings ;
 
Line 1,241 ⟶ 1,255:
 
"one^|uno||three^^^^|four^^^|^cuatro|"
CHAR: | CHAR: ^ tokenize .</syntaxhighlight>
{{out}}
<pre>
Line 1,248 ⟶ 1,262:
 
=={{header|Forth}}==
<syntaxhighlight lang="forth">variable 'src
variable #src
variable offset
Line 1,264 ⟶ 1,278:
page
cr ." #### start ####" cr tokenize cr ." #### End ####" cr
</syntaxhighlight>
{{output}}
<pre>
Line 1,280 ⟶ 1,294:
First Fortran (1958) offered no facilities for inspecting or manipulating text until Fortran IV, when the <code>A</code> format code was introduced, whereby text could be read or written from numeric variables. The difficulties and incompatibilities between different computers were eased with F77, which offered CHARACTER*n variables, though these are not quite strings since they do not have a varying length. F95 introduced the ability to define a compound entity such as a string, and F2003 standardised a version of strings whereby, with each assignment to such a variable, it is re-allocated with the required amount of storage. Otherwise, one proceeds with CHARACTER variables plus an associated variable holding the current length, as with <code>TOKEN</code> and <code>L</code>. However, when passed to a subroutine (or function) as a parameter, a CHARACTER variable is supplied along with a secret additional parameter giving the size of the variable, and this is string-like so long as there is no need to change the length. Thus, the length of parameter TEXT to subroutine SPLIT can be found via LEN(TEXT).
 
The source style is F90 simply for the convenience of having subroutine SPLOT defined within subroutine SPLIT so as to gain access to certain variables. If separate subroutines were to be used, then there would have to be parameters or COMMON variables, or, one could just replicate the code within SPLIT. A further F90 feature involves declaring the size of internal variable <code>TOKEN</code> to be <code>LEN(TEXT)</code>, which is surely the largest it could be. Otherwise, one would have to select some "surely big enough" value.<syntaxhighlight lang="fortran"> SUBROUTINE SPLIT(TEXT,SEP,ESC) !Identifies and prints tokens from within a text.
CHARACTER*(*) TEXT !To be scanned.
CHARACTER*(1) SEP !The only separator for tokens.
Line 1,323 ⟶ 1,337:
CALL SPLIT("one^|uno||three^^^^|four^^^|^cuatro|","|","^")
 
END</syntaxhighlight>
 
The output has the text of the tokens marked >thus<
Line 1,342 ⟶ 1,356:
=={{header|FreeBASIC}}==
{{trans|Ring}}
<syntaxhighlight lang="freebasic">Sub tokenize(cadena As String, separador As String, escape As String)
Dim As Integer campo = 1
Dim As Boolean escapando = false
Line 1,370 ⟶ 1,384:
 
tokenize("one^|uno||three^^^^|four^^^|^cuatro|", "|", "^")
Sleep</syntaxhighlight>
{{out}}
<pre>
Line 1,378 ⟶ 1,392:
 
=={{header|Go}}==
<syntaxhighlight lang="go">package main
 
import (
Line 1,421 ⟶ 1,435:
fmt.Printf("Tokens: %q\n", tokens)
}
}</syntaxhighlight>
{{out}}
<pre>
Line 1,432 ⟶ 1,446:
=== Deterministic Finite Automaton ===
 
<syntaxhighlight lang="haskell">splitEsc :: (Foldable t1, Eq t) => t -> t -> t1 t -> [[t]]
splitEsc sep esc = reverse . map reverse . snd . foldl process (0, [[]])
where process (st, r:rs) ch
Line 1,438 ⟶ 1,452:
| st == 0 && ch == sep = (0, []:r:rs)
| st == 1 && sep == esc && ch /= sep = (0, [ch]:r:rs)
| otherwise = (0, (ch:r):rs)</syntaxhighlight>
 
{{out}}
Line 1,458 ⟶ 1,472:
Constant in space (~ O(k), where k is the token length), and as fast as the DFA-based solution.
 
<syntaxhighlight lang="haskell">{-#Language LambdaCase #-}
import Conduit
 
Line 1,468 ⟶ 1,482:
Just ch | notEsc && ch == esc -> go False b
| notEsc && ch == sep -> yield b >> go True []
| otherwise -> go True (ch:b)</syntaxhighlight>
 
This new conduit could be used in a pipeline as follows:
 
<syntaxhighlight lang="haskell">main = runConduit $
yieldMany "one^|uno||three^^^^|four^^^|^cuatro|"
.| splitEscC '|' '^'
.| mapM_C print</syntaxhighlight>
 
<pre>λ> main
Line 1,486 ⟶ 1,500:
===Alternative===
This is essentially equivalent to the first (DFA) example but, though possibly less elegant than the guard idiom, appears to be fractionally faster with larger (e.g. 180k) test strings.
<syntaxhighlight lang="haskell">import Data.Bool (bool)
 
------------------ TOKENIZE WITH ESCAPING ----------------
Line 1,518 ⟶ 1,532:
'|'
'^'
"one^|uno||three^^^^|four^^^|^cuatro|"</syntaxhighlight>
{{Out}}
<pre>"one|uno"
Line 1,529 ⟶ 1,543:
 
From the python example:
<syntaxhighlight lang="j">
tokenize1=: tokenize =: '^|'&$: :(4 : 0)
'ESC SEP' =. x
Line 1,552 ⟶ 1,566:
RESULT =. RESULT , < TOKEN
)
</syntaxhighlight>
<pre>
tokenize 'one^|uno||three^^^^|four^^^|^cuatro|'
Line 1,563 ⟶ 1,577:
Here's a somewhat more efficient approach (over 100 times faster on a 100k textual example):
 
<syntaxhighlight lang="j">tokenize2=: tokenize=:3 :0
'^|' tokenize2 y NB. task default escape and separator
:
Line 1,572 ⟶ 1,586:
T=. (#y){. 1,}.S NB. token beginnings
(T<;.1 K)#&.>T<;.1 y
)</syntaxhighlight>
 
Example use:
 
<syntaxhighlight lang="j"> '^|' tokenize 'one^|uno||three^^^^|four^^^|^cuatro|'
┌───────┬┬───────┬────────────┬┐
│one|uno││three^^│four^|cuatro││
└───────┴┴───────┴────────────┴┘</syntaxhighlight>
 
 
Solution invoking the sequential machine primitive verb. [http://jsoftware.com/pipermail/programming/2014-December/040658.html See this thread.]<syntaxhighlight lang="j">charTokens =: (0;(3 2 2$(2 1 1 1 2 2 1 2 1 0 1 0));<<'^')&;: NB. sequential machine
splitTokens =: ((<,'|')&= <;._1 ])@:((<,'|'),])
removeExtra =: (}.^:(1<#)) L:0
tokenize3=: tokenize=: ; each @: (removeExtra @: splitTokens @: charTokens)</syntaxhighlight>Example use:<syntaxhighlight lang="j"> t=: 'one^|uno||three^^^^|four^^^|^cuatro|'
 
tokenize t
Line 1,593 ⟶ 1,607:
 
$tokenize t
5</syntaxhighlight>
 
Relative efficiencies:
 
<syntaxhighlight lang="j"> txt=: 1e5$'one^|uno||three^^^^|four^^^|^cuatro|'
(%"1 <./) timespacex every 'tokenize1 txt';'tokenize2 txt';'tokenize3 txt'
132.856 1
1 7.73534
8.29568 19.9766</syntaxhighlight>
 
So tokenize2 is the fastest and tokenize1 the slowest, while tokenize1 uses the least memory and tokenize3 the most. (The first column is relative time used, the second is relative space used; the rows correspond to the implementations.)
Line 1,609 ⟶ 1,623:
{{trans|Go}}
{{works with|Java|7}}
<syntaxhighlight lang="java">import java.util.*;
 
public class TokenizeStringWithEscaping {
Line 1,652 ⟶ 1,666:
return tokens;
}
}</syntaxhighlight>
 
<pre>[one|uno, , three^^, four^|cuatro, ]</pre>
Line 1,659 ⟶ 1,673:
===ES5===
====Iterative====
<syntaxhighlight lang="javascript">function tokenize(s, esc, sep) {
for (var a=[], t='', i=0, e=s.length; i<e; i+=1) {
var c = s.charAt(i)
Line 1,672 ⟶ 1,686:
var s = 'one^|uno||three^^^^|four^^^|^cuatro|'
document.write(s, '<br>')
for (var a=tokenize(s,'^','|'), i=0; i<a.length; i+=1) document.write(i, ': ', a[i], '<br>')</syntaxhighlight>
{{out}}
<pre>one^|uno||three^^^^|four^^^|^cuatro|
Line 1,683 ⟶ 1,697:
 
====Functional====
<syntaxhighlight lang="javascript">(function () {
'use strict';
 
Line 1,719 ⟶ 1,733:
.join('\n');
 
})();</syntaxhighlight>
{{Out}}
<pre>one|uno
Line 1,733 ⟶ 1,747:
 
{{Trans|Haskell}} (Single fold version)
<syntaxhighlight lang="javascript">((() => {
 
// tokenize :: String -> Character -> Character -> [String]
Line 1,768 ⟶ 1,782:
.map(show)
.join('\n');
}))();</syntaxhighlight>
 
{{Out}}
Line 1,780 ⟶ 1,794:
Defining the function as a composition of generics from a parser combinator library:
 
<syntaxhighlight lang="javascript">(() => {
'use strict';
 
Line 2,199 ⟶ 2,213:
// MAIN ---
return main();
})();</syntaxhighlight>
{{Out}}
<pre>[
Line 2,212 ⟶ 2,226:
=={{header|jq}}==
{{works with| jq|1.5}}
<syntaxhighlight lang="jq"># Tokenize the input using the string "escape" as the prefix escape string
def tokenize(separator; escape):
 
Line 2,247 ⟶ 2,261:
| map( if type == "string" then split(escape) else . end)
| flatten
| reform ;</syntaxhighlight>
 
'''Example:'''
<syntaxhighlight lang="jq">"one^|uno||three^^^^|four^^^|^cuatro|" | tokenize("|"; "^")</syntaxhighlight>
 
{{out}}
<syntaxhighlight lang="sh">$ jq -n -f tokenize.jq
[
"one|uno",
Line 2,260 ⟶ 2,274:
"four^|cuatro",
""
]</syntaxhighlight>
 
=={{header|Julia}}==
Line 2,266 ⟶ 2,280:
{{trans|Kotlin}}
 
<syntaxhighlight lang="julia">function tokenize2(s::AbstractString, sep::Char, esc::Char)
SPE = "\ufffe"
SPF = "\uffff"
Line 2,278 ⟶ 2,292:
end
 
@show tokenize2("one^|uno||three^^^^|four^^^|^cuatro|", '|', '^')</syntaxhighlight>
 
{{out}}
Line 2,284 ⟶ 2,298:
 
=={{header|Kotlin}}==
<syntaxhighlight lang="scala">// version 1.1.3
 
const val SPE = "\ufffe" // unused unicode char in Specials block
Line 2,304 ⟶ 2,318:
val items = tokenize(str, sep, esc)
for (item in items) println(if (item.isEmpty()) "(empty)" else item)
}</syntaxhighlight>
 
{{out}}
Line 2,316 ⟶ 2,330:
 
=={{header|Lingo}}==
<syntaxhighlight lang="lingo">-- in some movie script
 
on tokenize (str, sep, esc)
Line 2,358 ⟶ 2,372:
end repeat
return str
end</syntaxhighlight>
 
<syntaxhighlight lang="lingo">str = "one^|uno||three^^^^|four^^^|^cuatro|"
sep = "|"
esc = "^"
put tokenize(str, sep, esc)
-- ["one|uno", "", "three^^", "four^|cuatro", ""]</syntaxhighlight>
 
=={{header|Lua}}==
<syntaxhighlight lang="lua">function tokenise (str, sep, esc)
local strList, word, escaped, ch = {}, "", false
for pos = 1, #str do
Line 2,399 ⟶ 2,413:
for k, v in pairs(tokenise(testStr, testSep, testEsc)) do
print(k, v)
end</syntaxhighlight>
{{out}}
<pre>1 one|uno
Line 2,408 ⟶ 2,422:
 
=={{header|Mathematica}} / {{header|Wolfram Language}}==
<syntaxhighlight lang="mathematica">ClearAll[Tokenize]
Tokenize[str_String, escape_String : "^", sep_String : "|"] :=
Module[{results = {}, token = "", state = 0, a},
Line 2,438 ⟶ 2,452:
results
]
Tokenize["one^|uno||three^^^^|four^^^|^cuatro|"]</syntaxhighlight>
{{out}}
<pre>{"one|uno", "", "three^^", "four^|cuatro", ""}</pre>
 
=={{header|Nim}}==
<syntaxhighlight lang="nim">import streams
 
proc tokenize(s: Stream, sep: static[char] = '|', esc: static[char] = '^'): seq[string] =
Line 2,461 ⟶ 2,475:
for i, s in tokenize(newStringStream "one^|uno||three^^^^|four^^^|^cuatro|"):
echo i, ":", s
</syntaxhighlight>
{{out}}
<pre>0:one|uno
Line 2,472 ⟶ 2,486:
=={{header|OCaml}}==
 
<syntaxhighlight lang="ocaml">let split_with_escaping ~esc ~sep s =
let len = String.length s in
let buf = Buffer.create 16 in
Line 2,489 ⟶ 2,503:
end
in
loop 0</syntaxhighlight>
 
Example:
<syntaxhighlight lang="ocaml">let res = split_with_escaping ~esc:'^' ~sep:'|' "one^|uno||three^^^^|four^^^|^cuatro|";;
val res : string list = ["one|uno"; ""; "three^^"; "four^|cuatro"; ""]</syntaxhighlight>
 
=={{header|Perl}}==
Line 2,500 ⟶ 2,514:
The built-in <code>split</code> function can be used with a regex that matches the delimiter ''(although [http://perldoc.perl.org/perlre.html#Special-Backtracking-Control-Verbs advanced backtracking control verbs] are needed to skip escaped delimiters)'':
 
<syntaxhighlight lang="perl">sub tokenize {
my ($string, $sep, $esc) = (shift, quotemeta shift, quotemeta shift);
my @fields = split /$esc . (*SKIP)(*FAIL) | $sep/sx, $string, -1;
return map { s/$esc(.)/$1/gsr } @fields;
}</syntaxhighlight>
 
A more traditional approach is to parse the input string step by step ''(using a repeatedly-matching regex of the form [http://perldoc.perl.org/perlretut.html#Global-matching <code>/\G.../g</code>])'', and throw away the separators ''(which can be done implicitly using [http://perldoc.perl.org/perlre.html#%28?%3C=pattern%29-\K \K])'':
 
<syntaxhighlight lang="perl"> my @fields = $string =~ /\G (?:^ | $sep) \K (?: [^$sep$esc] | $esc .)*/gsx;</syntaxhighlight>
 
In both cases, stripping the escape characters happens as a separate step.
Line 2,515 ⟶ 2,529:
Testing:
 
<syntaxhighlight lang="perl">print "'$_'\n" for tokenize("one^|uno||three^^^^|four^^^|^cuatro|", '|', '^');</syntaxhighlight>
 
{{out}}
Line 2,527 ⟶ 2,541:
 
=={{header|Phix}}==
<!--<syntaxhighlight lang="phix">(phixonline)-->
<span style="color: #008080;">function</span> <span style="color: #000000;">tokenize</span><span style="color: #0000FF;">(</span><span style="color: #004080;">string</span> <span style="color: #000000;">s</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">integer</span> <span style="color: #000000;">sep</span><span style="color: #0000FF;">,</span> <span style="color: #004080;">integer</span> <span style="color: #000000;">esc</span><span style="color: #0000FF;">)</span>
<span style="color: #004080;">sequence</span> <span style="color: #000000;">ret</span> <span style="color: #0000FF;">=</span> <span style="color: #0000FF;">{}</span>
Line 2,554 ⟶ 2,568:
<span style="color: #0000FF;">?</span><span style="color: #000000;">tokenize</span><span style="color: #0000FF;">(</span><span style="color: #008000;">"one^|uno||three^^^^|four^^^|^cuatro|"</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'|'</span><span style="color: #0000FF;">,</span><span style="color: #008000;">'^'</span><span style="color: #0000FF;">)</span>
<!--</syntaxhighlight>-->
{{Out}}
<pre>
Line 2,561 ⟶ 2,575:
 
=={{header|PicoLisp}}==
<syntaxhighlight lang="picolisp">(de tokenize (Str Sep Esc)
(split
(make
Line 2,570 ⟶ 2,584:
((= C Sep) (link 0))
(T (link C)) ) ) ) )
0 ) )</syntaxhighlight>
Test:
<syntaxhighlight lang="picolisp">(for (I . S) (tokenize "one\^|uno||three\^\^\^\^|four\^\^\^|\^cuatro|" "|" "\^")
(prinl I ": " S) )</syntaxhighlight>
Output:
<pre>1: one|uno
Line 2,582 ⟶ 2,596:
 
=={{header|PowerShell}}==
<syntaxhighlight lang="powershell">
function Split-String ([string]$String, [char]$Separator, [char]$Escape)
{
Line 2,612 ⟶ 2,626:
if ($String[-1] -eq $Separator) {[String]::Empty}
}
</syntaxhighlight>
<syntaxhighlight lang="powershell">
Split-String "one^|uno||three^^^^|four^^^|^cuatro|" -Separator "|" -Escape "^" | ForEach-Object `
-Begin {$n = 0} `
-Process {$n+= 1; "{0}: {1}" -f $n, $_}
</syntaxhighlight>
{{Out}}
<pre>
Line 2,629 ⟶ 2,643:
=={{header|Python}}==
===Procedural===
<syntaxhighlight lang="python">def token_with_escape(a, escape = '^', separator = '|'):
'''
Issue python -m doctest thisfile.py to run the doctests.
Line 2,652 ⟶ 2,666:
state = 0
result.append(token)
return result</syntaxhighlight>
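
A brief usage sketch (assuming the complete <code>token_with_escape</code> function above; the expected list matches the outputs of the other implementations on this page):
<syntaxhighlight lang="python"># Illustrative call with the task's sample string.
tokens = token_with_escape('one^|uno||three^^^^|four^^^|^cuatro|')
print(tokens)  # expected: ['one|uno', '', 'three^^', 'four^|cuatro', '']</syntaxhighlight>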
 
===Functional===
{{Works with|Python|3}}
<syntaxhighlight lang="python">'''Tokenize a string with escaping'''
 
from functools import reduce
Line 2,697 ⟶ 2,711:
# MAIN ---
if __name__ == '__main__':
main()</syntaxhighlight>
{{Out}}
<pre>['one|uno', '', 'three^^', 'four^|cuatro', '']</pre>
 
===Regex-based===
 
====Using <code>Scanner</code>====
 
The python <code>re</code> library has a handy class <code>Scanner</code> which is intended precisely for this use-case.
Line 2,708 ⟶ 2,724:
 
The following code also illustrates an important feature of Python ‒ nested functions with closures.
Owing to this feature, the inner functions, such as <code>start_new_token</code>, are able to access the local variable <code>tokens</code> of their enclosing function <code>tokenize</code>. For the inner function, the name <code>tokens</code> is ''nonlocal'', and is in the ''enclosing scope'' of the inner function (as opposed to the parameters <code>scanner</code> and <code>substring</code>, which are in the local scope).
 
<syntaxhighlight lang="python">import re
 
STRING = 'one^|uno||three^^^^|four^^^|^cuatro|'
Line 2,747 ⟶ 2,764:
 
if __name__ == '__main__':
print(list(tokenize()))</syntaxhighlight>
 
Output is the same as in the functional Python version above.
 
 
====Simpler version with preprocessing====
 
This version does not require any extra state, such as the <code>token</code> list in the Scanner-based version above.
It first preprocesses the input, since Python does not support variable-length lookbehind assertions.
Then it works only with the primitive regex operations <code>re.findall</code> and <code>re.sub</code>.
Note that the regex used here is compiled with the <code>re.VERBOSE</code> flag.
This allows us to write the regex on several lines (since unescaped whitespace is ignored in this mode), and use comments inside the regex (starting with <code>#</code>).
 
<syntaxhighlight lang="python">import re
 
STRING = 'one^|uno||three^^^^|four^^^|^cuatro|'
 
def tokenize(string=STRING, escape='^', separator='|'):
 
re_escape, re_separator = map(re.escape, (escape, separator))
 
# token regex
regex = re.compile(fr'''
# lookbehind: a token must be preceded by a separator
# (note that `(?<=^|{re_separator})` doesn't work in Python)
(?<={re_separator})
 
# a token consists either of an escape sequence,
# or a regular (non-escape, non-separator) character,
# repeated arbitrarily many times (even zero)
(?:{re_escape}.|[^{re_escape}{re_separator}])*
''',
flags=re.VERBOSE
)
 
# since each token must start with a separator,
# we must add an extra separator at the beginning of input
preprocessed_string = separator + string
 
for almost_token in regex.findall(preprocessed_string):
# now get rid of escape characters: '^^' -> '^' etc.
token = re.sub(fr'{re_escape}(.)', r'\1', almost_token)
yield token
 
if __name__ == '__main__':
print(list(tokenize()))</syntaxhighlight>
 
=={{header|Racket}}==
<syntaxhighlight lang="racket">#lang racket/base
(require racket/match)
 
Line 2,777 ⟶ 2,836:
(report-input-output "|")
(report-input-output "^")
(report-input-output ".")</syntaxhighlight>
 
{{out}}
Line 2,798 ⟶ 2,857:
(formerly Perl 6)
 
<syntaxhighlight lang="raku" line>sub tokenize ($string, :$sep!, :$esc!) {
return $string.match(/([ <!before $sep | $esc> . | $esc . ]*)+ % $sep/)\
.[0].map(*.subst: /$esc )> ./, '', :g);
}
 
say "'$_'" for tokenize 'one^|uno||three^^^^|four^^^|^cuatro|', sep => '|', esc => '^';</syntaxhighlight>
 
{{out}}
Line 2,823 ⟶ 2,882:
=={{header|REXX}}==
===IF/THEN logic===
<syntaxhighlight lang="rexx">/*REXX program demonstrates tokenizing and displaying a string with escaping sequences. */
str = 'one^|uno||three^^^^|four^^^|^cuatro|' /*the character string to be tokenized.*/
esc = '^' /* " escape character to be used. */
Line 2,840 ⟶ 2,899:
exit /*stick a fork in it, we're all done. */
/*──────────────────────────────────────────────────────────────────────────────────────*/
show: say '[length'right(length(out),4)"]" out; out=; return</syntaxhighlight>
'''output'''
<pre>
Line 2,852 ⟶ 2,911:
===SELECT logic===
This REXX version also shows a scale in the output.
<syntaxhighlight lang="rexx">/*REXX program demonstrates tokenizing and displaying a string with escaping sequences. */
str = 'one^|uno||three^^^^|four^^^|^cuatro|' /*the character string to be tokenized.*/
esc = '^' /* " escape character to be used. */
Line 2,876 ⟶ 2,935:
exit /*stick a fork in it, we're all done. */
/*──────────────────────────────────────────────────────────────────────────────────────*/
show: say '[length'right(length($),4)"]" $; $=; return</syntaxhighlight>
'''output'''
<pre>
Line 2,892 ⟶ 2,951:
 
=={{header|Ring}}==
<syntaxhighlight lang="ring">
tokenize("one^|uno||three^^^^|four^^^|^cuatro|", "|", "^")
 
Line 2,918 ⟶ 2,977:
next
see nl
</syntaxhighlight>
Output:
<pre>
Line 2,934 ⟶ 2,993:
{{trans|Perl}}
 
<syntaxhighlight lang="ruby">
def tokenize(string, sep, esc)
sep = Regexp.escape(sep)
Line 2,945 ⟶ 3,004:
p tokenize('one^|uno||three^^^^|four^^^|^cuatro|', '|', '^')
 
</syntaxhighlight>
 
=={{header|Rust}}==
<syntaxhighlight lang="rust">const SEPARATOR: char = '|';
const ESCAPE: char = '^';
const STRING: &str = "one^|uno||three^^^^|four^^^|^cuatro|";
Line 2,976 ⟶ 3,035:
fn main() {
println!("{:#?}", tokenize(STRING));
}</syntaxhighlight>
{{out}}
<pre>
Line 2,991 ⟶ 3,050:
===Old fashioned Imperative===
Imperative, with the (ugly) mutable variables removed.
{{Trans|Kotlin}}<syntaxhighlight lang="scala">object TokenizeStringWithEscaping0 extends App {
 
val (markerSpE,markerSpF) = ("\ufffe" , "\uffff")
Line 3,005 ⟶ 3,064:
 
tokenize(str, "|", "^").foreach(it => println(if (it.isEmpty) "<empty token>" else it))
}</syntaxhighlight>
 
===Idiomatic===
====Functional with Tail recursion====
<syntaxhighlight lang="scala">import scala.annotation.tailrec
 
object TokenizeStringWithEscaping1 extends App {
Line 3,040 ⟶ 3,099:
println(
f"[length:${it.length}%3d] ${if (it.isEmpty) "<empty token>" else it}"))
}</syntaxhighlight>
 
{{Out}}See it running in your browser at [https://scalafiddle.io/sf/EsIjPQg/0 ScalaFiddle (JavaScript)] or [https://scastie.scala-lang.org/O3DgMmuOSCS5DD6zQXK7MA Scastie (JVM)].
Line 3,046 ⟶ 3,105:
=={{header|Sidef}}==
{{trans|Perl}}
<syntaxhighlight lang="ruby">func tokenize(string, sep, esc) {
var fields = string.split(
Regex(esc.escape + '.(*SKIP)(*FAIL)|' + sep.escape, 's'), -1
Line 3,055 ⟶ 3,114:
tokenize("one^|uno||three^^^^|four^^^|^cuatro|", '|', '^').each { |str|
say str.dump
}</syntaxhighlight>
{{out}}
<pre>
Line 3,066 ⟶ 3,125:
 
=={{header|Simula}}==
<syntaxhighlight lang="simula">
SIMSET
BEGIN
Line 3,130 ⟶ 3,189:
 
END.
</syntaxhighlight>
{{out}}
<pre>
Line 3,139 ⟶ 3,198:
PART4: 'FOUR^|CUATRO'
PART5: ''
</pre>
 
=={{header|SNOBOL4}}==
{{works with|SNOBOL4, SPITBOL for Linux}}
<syntaxhighlight lang="snobol4">
* Program: tokenize_with_escape.sbl
* To run: sbl tokenize_with_escape.sbl
* Description: Tokenize a string with escaping
* Comment: Tested using the Spitbol for Linux version of SNOBOL4
 
lf = substr(&alphabet,11,1) ;* New line or line feed
 
 
* Function tokenize will break parts out of a string, which are
* separated by c, which defaults to a comma, into
* an array. Parameter kp=1 to keep null parts, which is the default,
* and 0 to discard.
define('tokenize(s,c,kp)tokenizepat,part,t,i,j')
:(tokenize_end)
tokenize
c = (ident(c) ',', substr(c,1,1)) :f(freturn)
kp = (ident(kp) 1, eq(kp,0) 0, 1) :f(freturn)
t = table()
tokenizepat = breakx(c) . part c | (len(1) rem) . part
s ? eq(kp,1) rtab(1) c = s c
tokenize1
s ? tokenizepat = "" :f(tokenize2)
t[i = eq(kp,0) differ(part) i + 1] = part
t[i = eq(kp,1) i + 1] = part
:(tokenize1)
tokenize2
tokenize = array(i) :f(errr)
j = 0
tokenize3 tokenize[j = lt(j,i) j + 1] = t[j] :s(tokenize3)
:(return)
tokenize_end
 
 
* Function tokcan will a normalize a string by applying separator and escape
* rules to string ts. Parameter sep is the separator, while esc is the escape
* character. Parameter tesc is the new separator character to substitute for
* parameter sep. It defaults to a comma, ",".
define('tokcan(ts,sep,esc,tesc)tpat,part1,part2,notany') :(tokcan_end)
tokcan
tesc = (ident(tesc) ',', substr(tesc,1,1))
tpat = (breakx(sep esc) . part1
+ (sep | esc sep | esc esc | (esc len(1) . notany)) . part2
+ )
+ | (len(1) rem) . part1
 
tokcan1
ts ? tpat = :f(tokcan2)
part2 = (leq(part2,sep) tesc
+ ,leq(part2,esc sep) sep
+ ,leq(part2,esc esc) esc
+ ,differ(notany) leq(part2,esc notany) notany
+ )
tokcan = (ident(tokcan) "", tokcan) part1 part2
:(tokcan1)
tokcan2
:(return)
tokcan_end
 
 
test_string = "one^|uno||three^^^^|four^^^|^cuatro|"
sep = "|"
esc = "^"
 
hline = tokcan(test_string,sep,esc) :f(err)
 
 
output = " Input: " test_string lf
output = "Output1: " hline lf
 
output = "Output2: "
tokenized = tokenize(hline,",")
 
p1 output = "'" tokenized[z = z + 1] "'" :s(p1)
 
END
</syntaxhighlight>
{{out}}
<pre>
Input: one^|uno||three^^^^|four^^^|^cuatro|
 
Output1: one|uno,,three^^,four^|cuatro,
 
Output2:
'one|uno'
''
'three^^'
'four^|cuatro'
''
</pre>
 
Line 3,145 ⟶ 3,297:
{{trans|Rust}}
 
<syntaxhighlight lang="swift">extension String {
func tokenize(separator: Character, escape: Character) -> [String] {
var token = ""
Line 3,171 ⟶ 3,323:
}
 
print("one^|uno||three^^^^|four^^^|^cuatro|".tokenize(separator: "|", escape: "^"))</syntaxhighlight>
 
{{out}}
Line 3,179 ⟶ 3,331:
=={{header|Tcl}}==
Putting a coroutine in a TclOO object following the "generator pattern" gives a nice structure:
<syntaxhighlight lang="tcl">oo::class create tokens {
constructor {s} {
puts [coroutine Next my Iter $s]
Line 3,212 ⟶ 3,364:
}
 
puts [tokenize one^|uno||three^^^^|four^^^|^cuatro| | ^]</syntaxhighlight>
 
{{out}}
Line 3,219 ⟶ 3,371:
=={{header|TMG}}==
Unix TMG:
<syntaxhighlight lang="unixtmg">prog: char(sep) *
char(esc) *
str: smark
Line 3,233 ⟶ 3,385:
ch: 0;
sep: 0;
esc: 0;</syntaxhighlight>
 
Input:
Line 3,248 ⟶ 3,400:
 
=={{header|VBA}}==
{{trans|Phix}}<syntaxhighlight lang="vb">Private Function tokenize(s As String, sep As String, esc As String) As Collection
Dim ret As New Collection
Dim this As String
Line 3,286 ⟶ 3,438:
Next i
Debug.Print Join(outstring, ", ")
End Sub</syntaxhighlight>{{out}}
<pre>one|uno, , three^^, four^|cuatro, </pre>
 
=={{header|V (Vlang)}}==
{{trans|Go}}
<syntaxhighlight lang="ecmascript">fn tokenize_string(s string, sep u8, escape u8) ?[]string {
mut tokens := []string{}
mut runes := []u8{}
mut in_escape := false
for r in s {
if in_escape {
in_escape = false
runes << r
} else if r == escape {
in_escape = true
} else if r == sep {
tokens << runes.bytestr()
runes = runes[..0]
} else {
runes << r
}
}
tokens << runes.bytestr()
if in_escape {
return error("invalid terminal escape")
}
return tokens
}
 
const sample = "one^|uno||three^^^^|four^^^|^cuatro|"
const separator = `|`
const escape = `^`
fn main() {
println("Input: $sample")
tokens := tokenize_string(sample, separator, escape)?
println("Tokens: $tokens")
}</syntaxhighlight>
 
{{out}}
<pre>
Input: one^|uno||three^^^^|four^^^|^cuatro|
Tokens: ['one|uno', '', 'three^^', 'four^|cuatro', '']
</pre>
 
=={{header|Wren}}==
{{trans|Kotlin}}
<syntaxhighlight lang="wren">var SPE = "\ufffe" // unused unicode character in Specials block
var SPF = "\uffff" // ditto
 
Line 3,304 ⟶ 3,498:
var esc = "^"
var items = tokenize.call(str, sep, esc)
for (item in items) System.print((item == "") ? "(empty)" : item)</syntaxhighlight>
 
{{out}}
Line 3,317 ⟶ 3,511:
=={{header|zkl}}==
Two simplifying assumptions (since the behavior in these cases is undefined): a string ending with an un-escaped escape is an error, and 0xff is not an allowed character in the string.
<syntaxhighlight lang="zkl">fcn tokenize(str,sep,esc){
sink:=Sink(String);
foreach c in (str){
Line 3,327 ⟶ 3,521:
}
sink.close().split("\xff");
}</syntaxhighlight>
Or, if you prefer brevity:
<syntaxhighlight lang="zkl">fcn tokenize(str,sep,esc){
sink:=Sink(String);
foreach c in (str){ sink.write( (c==esc and __cWalker.next()) or (c==sep and "\xff") or c ) }
sink.close().split("\xff");
}</syntaxhighlight>
<syntaxhighlight lang="zkl">tokenize("one^|uno||three^^^^|four^^^|^cuatro|", "|","^").println();</syntaxhighlight>
{{out}}
<pre>L("one|uno","","three^^","four^|cuatro","")</pre>