GSTrans string conversion: Difference between revisions
m (→{{header|Phix}}: utf8 note) |
m (→{{header|Phix}}: stress won't) |
||
Line 332: | Line 332: | ||
and in fact penned a hexstr() rather similar to the two routines actually asked for, just to improve the console display a little bit.<br> |
and in fact penned a hexstr() rather similar to the two routines actually asked for, just to improve the console display a little bit.<br> |
||
Also, the following always encodes to uppercase, but the decode part will properly cope with (eg) "|m|j|@|e|!t|m|!|?".<br> |
Also, the following always encodes to uppercase, but the decode part will properly cope with (eg) "|m|j|@|e|!t|m|!|?".<br> |
||
As per Wren, strings in Phix are just sequences of bytes: UTF-8 or similar is completely irrelevant here, and won't mess up byte subscripting. |
As per Wren, strings in Phix are just sequences of bytes: UTF-8 or similar is completely irrelevant here, and ''won't'' mess up byte subscripting. |
||
<!--<syntaxhighlight lang="phix">(phixonline)--> |
<!--<syntaxhighlight lang="phix">(phixonline)--> |
||
<span style="color: #008080;">with</span> <span style="color: #008080;">javascript_semantics</span> |
<span style="color: #008080;">with</span> <span style="color: #008080;">javascript_semantics</span> |
Revision as of 02:51, 25 October 2023
You are encouraged to solve this task according to the task description, using any language you may know.
GSTrans string encoding is a method of encoding all 8-bit character values 0-255 with only printable characters. It originates on Acorn computers to allow command line commands to process non-printable characters.
Character   Encoding
0-31        |letter, eg |@, |A, |i, |[ etc.
32-126      the character itself, except for:
              "   which is encoded as |"
              |   which is encoded as ||
127         |?
128-255     |! followed by the encoding of the character minus 128, eg |!|@ = 128
A string can be surrounded in quotes, eg "ALERT|G".
See http://www.riscos.com/support/developers/prm/conversions.html
Examples:
|LHello|G|J|M encodes CHR$12;"Hello";CHR$7;CHR$10;CHR$13
"|m|j|@|e|!t|m|!|?" encodes 13,10,0,5,244,13,255
- Task
- Write two functions, one to encode a string of characters into a GSTrans string, and one to decode a GSTrans string. Indicate if any error checking is done, and how it is indicated.
ALGOL 68
As with the Wren sample, this includes the Julia test cases, but Algol 68 implementations don't generally handle UTF-8, so the Unicode characters are encoded as separate bytes.
This does very little error checking - so invalid coded strings will probably decode to incorrect values (but the input was incorrect anyway...).
Quoted strings retain their quotes when encoded or decoded.
Control characters are shown as their decimal values, enclosed in "[" and "]".
BEGIN # GSTrans string conversion #
OP   UNQUOTE = ( STRING s )STRING:     # returns s without one enclosing pair of quotes #
     IF   LWB s < UPB s
     THEN # at least two characters, so both ends can be inspected safely #
          IF   s[ LWB s ] = """" AND s[ UPB s ] = """"
          THEN s[ LWB s + 1 : UPB s - 1 ]
          ELSE s
          FI
     ELSE # empty or single character strings are returned unchanged #
          s
     FI # UNQUOTE # ;
OP   ENCODE = ( STRING str )STRING:    # returns the GSTrans encoding of str #
     BEGIN
        STRING body     = UNQUOTE str; # text to encode, without enclosing quotes #
        STRING encoded := "";
        FOR pos FROM LWB body TO UPB body DO
            CHAR ch   = body[ pos ];
            INT  code = ABS ch;
            encoded +:= IF   code >= 128 THEN
                             # top-bit-set: "|!" followed by the encoding of code - 128 #
                             "|!" + ENCODE STRING( REPR ( code - 128 ) )
                        ELIF code = 127 THEN
                             "|?"
                        ELIF code < 32 THEN
                             # control character: "|" and the character 64 places up #
                             "|" + REPR ( code + 64 )
                        ELIF ch = """" OR ch = "|" THEN
                             # the quote and the bar itself are escaped with a bar #
                             "|" + ch
                        ELSE
                             # any other printable character stands for itself #
                             ch
                        FI
        OD;
        # if UNQUOTE removed quotes, put them back around the result #
        IF body /= str THEN """" + encoded + """" ELSE encoded FI
     END # ENCODE # ;
OP   DECODE = ( STRING str )STRING:    # returns the bytes designated by the GSTrans string str #
     BEGIN
        # returns the character designated by e, the character that followed a "|":     #
        #     "|" and the quote stand for themselves, "?" is 127,                        #
        #     "@", upper case letters and [\]^_ are 0-31,                                #
        #     lower case letters and {}~ are 1-30 and the grave accent is 31             #
        #     (the lower case forms appear in the task example "|m|j|@|e|!t|m|!|?")      #
        # any other character is not a valid escape and is returned unchanged            #
        PROC designated = ( CHAR e )CHAR:
             BEGIN
                INT c = ABS e;
                IF   e = "|" OR e = """"    THEN e
                ELIF e = "?"                THEN REPR 127
                ELIF c >= 64 AND c <=  95   THEN REPR ( c - 64 )
                ELIF c  = 96                THEN REPR 31
                ELIF c >= 97 AND c <= 126   THEN REPR ( c - 96 )
                ELSE                             e
                FI
             END # designated # ;
        STRING result := "";
        STRING s       = UNQUOTE str;
        INT    i      := LWB s;
        WHILE i <= UPB s DO
            result +:= IF   s[ i ] /= "|" THEN
                            # ordinary character #
                            s[ i ]
                       ELIF ( i +:= 1 ) > UPB s THEN
                            # dangling "|" at the end of the string - ignored #
                            ""
                       ELIF s[ i ] /= "!" THEN
                            # simple escape: "|x" #
                            designated( s[ i ] )
                       ELSE
                            # "|!" - the next item designates a byte with the top bit set #
                            i +:= 1;
                            IF   i > UPB s THEN
                                 ""
                            ELIF s[ i ] /= "|" THEN
                                 # "|!x" - plain character plus 128 #
                                 REPR ( ABS s[ i ] + 128 )
                            ELIF ( i +:= 1 ) > UPB s THEN
                                 ""
                            ELSE
                                 # "|!|x" - escaped character plus 128; decoded directly #
                                 # so a malformed "|!|!" cannot yield an empty string    #
                                 REPR ( ABS designated( s[ i ] ) + 128 )
                            FI
                       FI;
            i +:= 1
        OD;
        # if UNQUOTE removed quotes, put them back around the result #
        IF s /= str THEN """" + result + """" ELSE result FI
     END # DECODE # ;
OP   SHOWBYTES = ( STRING s )STRING:   # returns s with each character below 32 replaced #
     BEGIN                             # by its decimal code enclosed in "[" and "]"     #
        STRING shown := "";
        FOR pos FROM LWB s TO UPB s DO
            INT code = ABS s[ pos ];
            shown +:= IF   code >= 32
                      THEN s[ pos ]
                      ELSE "[" + whole( code, 0 ) + "]"
                      FI
        OD;
        shown
     END # SHOWBYTES # ;
# the strings to round-trip: the Julia sample's cases, the task's own examples and a quoted string #
[]STRING test = ( "ALERT|G", "wert↑", "@♂aN°$ª7Î"
                , "ÙC▼æÔt6¤☻Ì", """@)Ð♠qhýÌÿ", "+☻#o9$u♠©A"
                , "♣àlæi6Ú.é", "ÏÔ♀È♥@ë", "Rç÷\%◄MZûhZ"
                , "ç>¾AôVâ♫↓P"
                , REPR 12 + "Hello" + REPR 7 + REPR 10 + REPR 13
                , REPR 13 + REPR 10 + REPR 0 + REPR 5 + REPR 244 + REPR 13 + REPR 255
                , """quoted|text"""
                );
# encode each string, decode the result and flag any that do not round-trip with " ****" #
FOR n FROM LWB test TO UPB test DO
    STRING original = test[ n ];
    STRING encoded  = ENCODE original;
    STRING decoded  = DECODE encoded;
    STRING marker   = IF decoded = original THEN "" ELSE " ****" FI;
    print( ( SHOWBYTES original, " -> ", encoded, " -> ", SHOWBYTES decoded, marker, newline ) )
OD
END
- Output:
ALERT|G -> ALERT||G -> ALERT|G
wert↑ -> wert|!b|!|F|!|Q -> wert↑
@♂aN°$ª7Î -> @|!b|!|Y|!|BaN|!B|!0$|!B|!*7|!C|!|N -> @♂aN°$ª7Î
ÙC▼æÔt6¤☻Ì -> |!C|!|YC|!b|!|V|!<|!C|!&|!C|!|Tt6|!B|!$|!b|!|X|!;|!C|!|L -> ÙC▼æÔt6¤☻Ì
"@)Ð♠qhýÌÿ -> |"@)|!C|!|P|!b|!|Y|! qh|!C|!=|!C|!|L|!C|!? -> "@)Ð♠qhýÌÿ
+☻#o9$u♠©A -> +|!b|!|X|!;#o9$u|!b|!|Y|! |!B|!)A -> +☻#o9$u♠©A
♣àlæi6Ú.é -> |!b|!|Y|!#|!C|! l|!C|!&i6|!C|!|Z.|!C|!) -> ♣àlæi6Ú.é
ÏÔ♀È♥@ë -> |!C|!|O|!C|!|T|!b|!|Y|!|@|!C|!|H|!b|!|Y|!%@|!C|!+ -> ÏÔ♀È♥@ë
Rç÷\%◄MZûhZ -> R|!C|!'|!C|!7\%|!b|!|W|!|DMZ|!C|!;hZ -> Rç÷\%◄MZûhZ
ç>¾AôVâ♫↓P -> |!C|!'>|!B|!>A|!C|!4V|!C|!|"|!b|!|Y|!+|!b|!|F|!|SP -> ç>¾AôVâ♫↓P
[12]Hello[7][10][13] -> |LHello|G|J|M -> [12]Hello[7][10][13]
[13][10][0][5]ô[13]ÿ -> |M|J|@|E|!t|M|!|? -> [13][10][0][5]ô[13]ÿ
"quoted|text" -> "quoted||text" -> "quoted|text"
BBC BASIC
10 REM > GSTrans.bbc
20 REM GSTrans in BASIC
30 REM J.G.Harston
32 REM Test harness: repeatedly reads a line, decodes it as a GSTrans
34 REM string, re-encodes the decoded bytes and prints the result.
36 REM The loop never terminates by itself (UNTIL FALSE).
40 :
50 REPEAT
60 INPUT LINE "GSstring: "A$
70 A$=FNGS_Decode(A$,0)
80 A$=FNGS_Encode(A$)
90 PRINT A$
100 UNTIL FALSE
110 END
120 :
130 :
140 :
150 REM Decode a GSTrans string
160 REM On entry: inp$=GSTransed string
170 REM flg%=0 - parse whole string, *KEY style
180 REM =1 - parse until space, filename style (not implemented)
190 REM Returns: decoded string
192 REM flg% is accepted but not referenced in the body.
194 REM inp$ is consumed from the left as it is decoded.
200 DEFFNGS_Decode(inp$,flg%)
210 LOCAL out$,byte%,set%
215 REM Strip leading spaces
220 IF LEFT$(inp$,1)=" ":REPEAT:inp$=MID$(inp$,2):UNTIL LEFT$(inp$,1)<>" "
225 REM Strip enclosing quotes when the string both starts and ends with one
230 IF LEFT$(inp$,1)="""":IF RIGHT$(inp$,1)="""":inp$=MID$(inp$,2,LENinp$-2)
240 IF inp$="":=""
250 REPEAT
255 REM byte%=-1 means no escape recognised yet, set%=value added for |!
260 byte%=-1:set%=0
265 REM |! prefix: add 128 to whatever follows, and drop the prefix
270 IF LEFT$(inp$,2)="|!":set%=128:inp$=MID$(inp$,3)
275 REM Any |x: low five bits of x; the next three lines override special cases
280 IF LEFT$(inp$,1)="|":byte%=ASCMID$(inp$,2,1)AND31
290 IF LEFT$(inp$,2)="||":byte%=ASC"|"
300 IF LEFT$(inp$,2)="|?":byte%=127
310 IF LEFT$(inp$,2)="|""":byte%=34
315 REM Two consecutive quote characters decode to a single quote
320 IF LEFT$(inp$,2)="""""":byte%=34
325 REM No escape: take one literal character; otherwise consume the two-character escape
330 IF byte%<0:byte%=ASC(inp$):inp$=MID$(inp$,2) ELSE inp$=MID$(inp$,3)
340 out$=out$+CHR$(set%+byte%)
350 UNTIL inp$=""
360 =out$
370 :
380 REM Encode into a GSTrans string
390 REM On entry: inp$=raw string
400 REM Returns: GSTrans string
402 REM The result is always enclosed in quotes; an empty input
404 REM returns just the two quote characters.
406 REM inp$ is consumed one character at a time from the left.
410 DEFFNGS_Encode(inp$)
420 LOCAL out$,byte%
430 IF inp$="":=""""""
440 REPEAT
450 byte%=ASC(inp$):inp$=MID$(inp$,2)
455 REM Top-bit-set: emit |! then encode the low seven bits below
460 IF byte%>127:out$=out$+"|!":byte%=byte% AND 127
465 REM Printable characters other than quote, | and 127 are copied unchanged
470 IF byte%>31 AND byte%<>ASC"""" AND byte%<>ASC"|" AND byte%<>127:out$=out$+CHR$(byte%)
475 REM Control codes 0-31 become | followed by the character 64 higher
480 IF byte%<32:out$=out$+"|"+CHR$(byte%+64)
485 REM A quote is emitted as two quote characters, | as ||, 127 as |?
490 IF byte%=ASC"""":out$=out$+""""""
500 IF byte%=ASC"|":out$=out$+"||"
510 IF byte%=127:out$=out$+"|?"
520 UNTIL inp$=""
530 =""""+out$+""""
540 :
No checking of string lengths is done. On decoding, invalid encodings are ignored and skipped, for instance |4 is decoded as 4.
Julia
"""
ASCII code Symbols used
0 |@
1 - 26 |letter eg |A (or |a) = ASCII 1, |M (or |m) = ASCII 13
27 |[ or |{
28 |\
29 |] or |}
30 |^ or |~
31 |_ or |' (grave accent)
32 - 126 keyboard character, except for:
" |"
| ||
< |<
127 |?
128 - 255 |!coded symbol eg ASCII 128 = |!|@ ASCII 129 = |!|A
See also www.riscos.com/support/developers/prm/conversions.html
"""
"""
GSTrans_encode(arr::Vector{Char})
To avoid Unicode multibyte glitches, handle as vector of Chars, but we throw
an assertion error if any are multibyte (so, 0 <= integer value of char <= 255).
"""
function GSTrans_encode(arr::Vector{Char})
    # Encode one character whose code point is in 0:255; returns a Vector{Char}.
    function GSTChar_encode(c::Char)
        i = Int(c)
        @assert 0 <= i <= 255 "Char value of $c, $i, is out of range"
        resultchars = Char[]
        if 0 <= i <= 31
            # control character: '|' followed by the character 64 places higher
            push!(resultchars, '|', Char(64 + i))
        elseif c == '"' || c == '|' || c == '<'
            # characters with a special meaning in a GSTrans string are escaped
            # with a bar; '<' is listed in this file's encoding table and is
            # accepted by GSTrans_decode as "|<"
            push!(resultchars, '|', c)
        elseif i == 127
            push!(resultchars, '|', '?')
        elseif 128 <= i <= 255 # |! then recurse after subtracting 128
            push!(resultchars, '|', '!', GSTChar_encode(Char(i - 128))...)
        else
            # any other printable character stands for itself
            push!(resultchars, c)
        end
        return resultchars
    end
    # init = Char[] makes an empty input encode to the empty string
    return String(mapreduce(GSTChar_encode, vcat, arr, init = Char[]))
end
"""
    GSTrans_encode(str::AbstractString)

Encode `str` byte by byte: the string is transcoded to its `UInt8` code units,
each byte is turned into a `Char` in the range 0:255, and the resulting vector
is passed to the `Vector{Char}` method. Multi-byte UTF-8 characters are
therefore encoded as their constituent bytes.
"""
GSTrans_encode(str::AbstractString) = GSTrans_encode(Char.(transcode(UInt8, str)))
function GSTrans_decode(str::AbstractString)
    # Value designated by the character that follows a '|' (before any +128).
    function escapevalue(ch)
        ch == '?' && return 127
        ch in ('|', '"', '<') && return Int(ch)
        ch in ('[', '{') && return 27
        ch == '\\' && return 28
        ch in (']', '}') && return 29
        ch in ('^', '~') && return 30
        ch in ('_', '`') && return 31
        return Int(uppercase(ch)) - 64
    end

    out = UInt8[]
    afterbar = false     # the previous character was a '|' still awaiting its operand
    afterbang = false    # the previous two characters were "|!"
    highbit = 0          # 128 while decoding the escape that follows "|!|"
    for ch in str
        if afterbang
            afterbang = false
            if ch == '|'
                # "|!|x": decode x as an escape on the next pass, then add 128
                highbit = 128
                afterbar = true
            else
                # "|!x": plain character plus 128
                push!(out, Char(Int(ch) + 128))
            end
        elseif afterbar
            if ch == '!'
                afterbang = true
            else
                push!(out, Char(escapevalue(ch) + highbit))
            end
            afterbar, highbit = false, 0
        elseif ch == '|'
            afterbar = true
        else
            push!(out, Char(ch))
        end
    end
    return String(out)
end
# Fixed round-trip cases, eight random 10-character strings with code points
# in 0:255, and two already-encoded strings that are only decoded.
const TESTS = ["ALERT|G", "wert↑"]
const RAND_TESTS = [String(Char.(rand(0:255, 10))) for _ in 1:8]
const DECODE_TESTS = ["|LHello|G|J|M", "|m|j|@|e|!t|m|!|?"]

# Every fixed and random string must survive encode followed by decode.
for t in vcat(TESTS, RAND_TESTS)
    encoded = GSTrans_encode(t)
    decoded = GSTrans_decode(encoded)
    println("String ", t, " encoded is: ", encoded, ", decoded is: ", decoded, ".")
    @assert t == decoded
end

# Show what the pre-encoded strings decode to, using display so that control
# characters are visible as escapes.
foreach(DECODE_TESTS) do enc
    print("Encoded string $enc decoded is: ")
    display(GSTrans_decode(enc))
end
- Output:
String ALERT|G encoded is: ALERT||G, decoded is: ALERT|G. String wert↑ encoded is: wert|!b|!|F|!|Q, decoded is: wert↑. String @♂aN°$ª7Î encoded is: @|KaN|!B|!0$|!B|!*7|!B|!|R|!C|!|N, decoded is: @♂aN°$ª7Î. String ÙC▼æÔt6¤☻Ì encoded is: |!C|!|YC|_|!C|!&|!C|!|Tt6|!B|!$|B|!C|!|L, decoded is: ÙC▼æÔt6¤☻Ì. String "@)Ð♠qhýÌÿ encoded is: |"@)|!C|!|P|Fqh|!C|!=|!C|!|L|!C|!?, decoded is: "@)Ð♠qhýÌÿ. String +☻#o9$u♠©A encoded is: +|B#o9$u|F|!B|!)A, decoded is: +☻#o9$u♠©A. String ♣àlæi6Ú.é encoded is: |E|!C|! l|!B|!|K|!C|!&i6|!C|!|Z.|!C|!), decoded is: ♣àlæi6Ú.é. String ÏÔ♀È♥@ë encoded is: |!C|!|O|!C|!|T|!B|!|[|Lj|!C|!|H|C@|!B|!|I|!C|!+, decoded is: ÏÔ♀È♥@ë. String Rç÷%◄MZûhZ encoded is: R|!C|!'|!C|!7%|QMZ|!C|!;hZ, decoded is: Rç÷%◄MZûhZ. String ç>¾AôVâ♫↓P encoded is: |!C|!'>|!B|!>A|!C|!4V|!C|!|"|N|YP, decoded is: ç>¾AôVâ♫↓P. Encoded string |LHello|G|J|M decoded is: "\fHello\a\n\r" Encoded string |m|j|@|e|!t|m|!|? decoded is: "\r\n\0\x05\xf4\r\xff"
Phix
Note all those unicode strings work fine in a browser and on linux, but look horrible in a windows console, so I left them out,
and in fact penned a hexstr() rather similar to the two routines actually asked for, just to improve the console display a little bit.
Also, the following always encodes to uppercase, but the decode part will properly cope with (eg) "|m|j|@|e|!t|m|!|?".
As per Wren, strings in Phix are just sequences of bytes: UTF-8 or similar is completely irrelevant here, and won't mess up byte subscripting.
with javascript_semantics

-- Returns the GSTrans encoding of the bytes of s (control letters are always upper case).
function GSTrans_encode(string s)
    string encoded = ""
    for b in s do
        integer ch = b
        if ch>=128 then
            -- top-bit-set byte: emit the "|!" prefix, then encode the low seven bits
            encoded &= "|!"
            ch -= 128
        end if
        if ch<' ' then
            -- control code: '|' followed by the character 64 places higher
            encoded &= "|"&('@'+ch)
        elsif find(ch,`"|<`) then
            -- specially treated characters are escaped with a bar
            encoded &= "|"&ch
        elsif ch='\x7F' then
            encoded &= "|?"
        else
            encoded &= ch
        end if
    end for
    return encoded
end function

-- Returns the bytes designated by the GSTrans string s; accepts upper and lower case escapes.
function GSTrans_decode(string s)
    string decoded = ""
    bool escaped = false    -- true when the previous byte was a '|' awaiting its operand
    integer high = #00      -- #80 from a "|!" until the byte it applies to has been emitted
    for b in s do
        integer ch = b
        if escaped then
            escaped = false
            if ch='!' then
                assert(high==#00)
                high = #80
            else
                if ch='?' then
                    ch = #7F
                elsif not find(ch,`"|<`) then
                    -- lower case forms are #60 above their code, upper case forms #40
                    ch -= iff(ch>='a'?#60:#40)
                end if
                decoded &= ch+high
                high = #00
            end if
        elsif ch='|' then
            escaped = true
        else
            decoded &= ch+high
            high = #00
        end if
    end for
    return decoded
end function

-- Returns s with non-printable bytes shown as \r, \n, \t, \0 or \xHH, for console display.
function hexstr(string s)
    string shown = ""
    for b in s do
        integer k = find(b,"\r\n\t\0")
        if b>=' ' and b<='~' then
            shown &= b
        elsif k then
            shown &= '\\'&("rnt0"[k])
        else
            shown &= sprintf("\\x%02x",b)
        end if
    end for
    return shown
end function

-- the two examples from the task description, as raw bytes
constant tests = {"\x0CHello\x07\n\r",
                  "\r\n\0\x05\xF4\r\xFF"}
for t in tests do
    string e = GSTrans_encode(t),
           d = GSTrans_decode(e)
    printf(1,"%s <-> %s (decoded same:%t)\n",{hexstr(t),hexstr(e),d=t})
end for
- Output:
\x0CHello\x07\n\r <-> |LHello|G|J|M (decoded same:true) \r\n\0\x05\xF4\r\xFF <-> |M|J|@|E|!t|M|!|? (decoded same:true)
Wren
Strings in Wren are just an immutable array of bytes. They are usually interpreted as UTF-8 but don't have to be. Unicode characters in the example Julia strings are therefore encoded using their constituent UTF-8 bytes which decodes fine but may not give the same encoding as Julia itself.
If an invalid byte (following the "|" flag) is encountered whilst decoding, it is decoded as if the flag were not present.
Where strings contain control characters, their decoded version is printed to the terminal as a byte list.
// GSTrans encoding and decoding of strings, treated as sequences of bytes.
class GSTrans {
    // Returns the GSTrans encoding of the bytes of 's'.
    // One pair of enclosing quotation marks is removed first.
    // 'upper' selects upper (true) or lower (false) case letters for codes 1 to 26.
    // Aborts the fiber unless 's' is a non-empty string.
    static encode(s, upper) {
        if (!(s is String) || s.count == 0) Fiber.abort("Argument must be a non-empty string.")
        // remove any outer quotation marks
        if (s.count > 1 && s[0] == "\"" && s[-1] == "\"") s = s[1..-2]

        // encodes a single byte below 128
        var low = Fn.new { |b|
            if (b == 127) return "|?"   // DEL
            if (b == 124) return "||"   // vertical bar
            if (b == 60)  return "|<"   // less than
            if (b == 34)  return "|\""  // quotation mark
            if (b >= 32)  return String.fromByte(b)
            // control code: only 1 to 26 have a lower case form
            var offset = (b >= 1 && b <= 26 && !upper) ? 96 : 64
            return "|" + String.fromByte(b + offset)
        }

        var enc = ""
        // bytes of 128 or more are written as "|!" followed by the encoding of byte - 128
        for (b in s.bytes) {
            enc = enc + ((b >= 128) ? "|!" + low.call(b - 128) : low.call(b))
        }
        return enc
    }

    // Returns the string whose bytes are designated by the GSTrans string 's'.
    // One pair of enclosing quotation marks is removed first.
    // A byte after "|" that is not a recognised escape is decoded as itself.
    // Aborts the fiber unless 's' is a non-empty string.
    static decode(s) {
        if (!(s is String) || s.count == 0) Fiber.abort("Argument must be a non-empty string.")
        // remove any outer quotation marks
        if (s.count > 1 && s[0] == "\"" && s[-1] == "\"") s = s[1..-2]

        // value designated by the byte following "|"
        var unescape = Fn.new { |b|
            if (b == 63) return 127                          // question mark
            if (b == 96) return 31                           // grave accent
            if (b == 34 || b == 60 || b == 124) return b     // " < | stand for themselves
            if (b >= 64 && b < 96) return b - 64             // @ + upper case letter + [\]^_
            if (b >= 97 && b < 127) return b - 96            // lower case letter + {}~
            return b
        }

        var bytes = s.bytes.toList
        var bc = bytes.count
        var i = 0
        var dec = ""
        while (i < bc) {
            if (bytes[i] != 124) {
                // ordinary byte
                dec = dec + String.fromByte(bytes[i])
                i = i + 1
            } else if (i < bc - 1 && bytes[i+1] != 33) {
                // "|x"
                dec = dec + String.fromByte(unescape.call(bytes[i+1]))
                i = i + 2
            } else if (i < bc - 2 && bytes[i+2] != 124) {
                // "|!x"
                dec = dec + String.fromByte(128 + bytes[i+2])
                i = i + 3
            } else if (i < bc - 3 && bytes[i+2] == 124) {
                // "|!|x"
                dec = dec + String.fromByte(128 + unescape.call(bytes[i+3]))
                i = i + 4
            } else {
                // incomplete escape at the end of the string: skip the bar
                i = i + 1
            }
        }
        return dec
    }
}
// the task's two examples as actual bytes ...
var strings = [
    "\fHello\a\n\r",
    "\r\n\0\x05\xf4\r\xff"
]

// ... and as raw text, so the escapes can be printed as written
var texts = [
    """\fHello\a\n\r""",
    """\r\n\0\x05\xf4\r\xff"""
]

// the first example is encoded with upper case letters, the second with lower case
var uppers = [true, false]

var i = 0
for (s in strings) {
    var t = "\"%(texts[i])\""
    var u = uppers[i]
    var enc = GSTrans.encode(s, u)
    var dec = GSTrans.decode(enc)
    System.print("string: %(t)")
    System.print("encoded (%(u ? "upper" : "lower")) : %(enc)")
    System.print("decoded (as byte list): %(dec.bytes.toList)")
    System.print("string == decoded ? %(dec == s)\n")
    i = i + 1
}

// the test strings from the Julia sample, encoded as their UTF-8 bytes
var jstrings = [
    "ALERT|G",
    "wert↑",
    "@♂aN°$ª7Î",
    "ÙC▼æÔt6¤☻Ì",
    "\"@)Ð♠qhýÌÿ",
    "+☻#o9$u♠©A",
    "♣àlæi6Ú.é",
    "ÏÔ♀È♥@ë",
    "Rç÷\%◄MZûhZ",
    "ç>¾AôVâ♫↓P"
]

System.print("Julia strings: string -> encoded (upper) <- decoded (same or different)\n")
for (js in jstrings) {
    var enc = GSTrans.encode(js, true)
    var dec = GSTrans.decode(enc)
    var verdict = (js == dec) ? "same" : "different"
    System.print(" %(js) -> %(enc) <- %(dec) (%(verdict))")
}
- Output:
string: "\fHello\a\n\r" encoded (upper) : |LHello|G|J|M decoded (as byte list): [12, 72, 101, 108, 108, 111, 7, 10, 13] string == decoded ? true string: "\r\n\0\x05\xf4\r\xff" encoded (lower) : |m|j|@|e|!t|m|!|? decoded (as byte list): [13, 10, 0, 5, 244, 13, 255] string == decoded ? true Julia strings: string -> encoded (upper) <- decoded (same or different) ALERT|G -> ALERT||G <- ALERT|G (same) wert↑ -> wert|!b|!|F|!|Q <- wert↑ (same) @♂aN°$ª7Î -> @|!b|!|Y|!|BaN|!B|!0$|!B|!*7|!C|!|N <- @♂aN°$ª7Î (same) ÙC▼æÔt6¤☻Ì -> |!C|!|YC|!b|!|V|!|<|!C|!&|!C|!|Tt6|!B|!$|!b|!|X|!;|!C|!|L <- ÙC▼æÔt6¤☻Ì (same) "@)Ð♠qhýÌÿ -> |"@)|!C|!|P|!b|!|Y|! qh|!C|!=|!C|!|L|!C|!? <- "@)Ð♠qhýÌÿ (same) +☻#o9$u♠©A -> +|!b|!|X|!;#o9$u|!b|!|Y|! |!B|!)A <- +☻#o9$u♠©A (same) ♣àlæi6Ú.é -> |!b|!|Y|!#|!C|! l|!C|!&i6|!C|!|Z.|!C|!) <- ♣àlæi6Ú.é (same) ÏÔ♀È♥@ë -> |!C|!|O|!C|!|T|!b|!|Y|!|@|!C|!|H|!b|!|Y|!%@|!C|!+ <- ÏÔ♀È♥@ë (same) Rç÷%◄MZûhZ -> R|!C|!'|!C|!7%|!b|!|W|!|DMZ|!C|!;hZ <- Rç÷%◄MZûhZ (same) ç>¾AôVâ♫↓P -> |!C|!'>|!B|!>A|!C|!4V|!C|!|"|!b|!|Y|!+|!b|!|F|!|SP <- ç>¾AôVâ♫↓P (same)