GSTrans string conversion: Difference between revisions

Content added Content deleted

Inline

Revision as of 12:50, 24 October 2023

GSTrans string encoding is a method of encoding all 8-bit character values 0-255 using only printable characters. It originated on Acorn computers, where it allows command-line commands to process non-printable characters.

 Character Encoding
 0-31      |letter eg |@, |A, |i, |[ etc.
 32-126    character, except for:
 "         |"
 |         ||
 127       |?
 128-255   |! followed by encoding, eg |!|@ = 128

A string can be surrounded in quotes, eg "ALERT|G".

See http://www.riscos.com/support/developers/prm/conversions.html

Examples:

 |LHello|G|J|M       encodes  CHR$12;"Hello";CHR$7;CHR$10;CHR$13
 "|m|j|@|e|!t|m|!|?" encodes  13,10,0,5,244,13,255

Task

Write two functions, one to encode a string of characters into a GSTrans string, and one to decode a GSTrans string. Indicate if any error checking is done, and how it is indicated.

BBC BASIC

   10 REM > GSTrans.bbc
   20 REM GSTrans in BASIC
   30 REM J.G.Harston
   40 :
   45 REM Test loop: read a line, decode it as GSTrans, re-encode the result and print it
   50 REPEAT
   60   INPUT LINE "GSstring: "A$
   70   A$=FNGS_Decode(A$,0)
   80   A$=FNGS_Encode(A$)
   90   PRINT A$
  100 UNTIL FALSE
  105 REM UNTIL FALSE never exits the loop by itself, so END is only a guard before the DEFs
  110 END
  120 :
  130 :
  140 :
  150 REM Decode a GSTrans string
  160 REM On entry: inp$=GSTransed string
  170 REM           flg%=0 - parse whole string, *KEY style
  180 REM               =1 - parse until space, filename style (not implemented)
  190 REM Returns:  decoded string
  195 REM Note: flg% is accepted but never read in the body below
  200 DEFFNGS_Decode(inp$,flg%)
  210 LOCAL out$,byte%,set%
  215 REM Drop leading spaces, then one pair of enclosing quotes if both are present
  220 IF LEFT$(inp$,1)=" ":REPEAT:inp$=MID$(inp$,2):UNTIL LEFT$(inp$,1)<>" "
  230 IF LEFT$(inp$,1)="""":IF RIGHT$(inp$,1)="""":inp$=MID$(inp$,2,LENinp$-2)
  240 IF inp$="":=""
  250 REPEAT
  255   REM byte%=-1 means "no escape matched yet"; set% becomes 128 after a |! prefix
  260   byte%=-1:set%=0
  270   IF LEFT$(inp$,2)="|!":set%=128:inp$=MID$(inp$,3)
  275   REM Generic |x escape first (low five bits of x); the specific cases below override it
  280   IF LEFT$(inp$,1)="|":byte%=ASCMID$(inp$,2,1)AND31
  290   IF LEFT$(inp$,2)="||":byte%=ASC"|"
  300   IF LEFT$(inp$,2)="|?":byte%=127
  310   IF LEFT$(inp$,2)="|""":byte%=34
  315   REM A doubled quote is also accepted as a single quote character
  320   IF LEFT$(inp$,2)="""""":byte%=34
  325   REM No escape matched: consume one literal character; otherwise skip the 2-character escape
  330   IF byte%<0:byte%=ASC(inp$):inp$=MID$(inp$,2) ELSE inp$=MID$(inp$,3)
  340   out$=out$+CHR$(set%+byte%)
  350 UNTIL inp$=""
  360 =out$
  370 :
  380 REM Encode into a GSTrans string
  390 REM On entry: inp$=raw string
  400 REM Returns:  GSTrans string
  410 DEFFNGS_Encode(inp$)
  420 LOCAL out$,byte%
  425 REM An empty input is returned as an empty pair of quotes
  430 IF inp$="":=""""""
  440 REPEAT
  445   REM Take the next character code off the front of inp$
  450   byte%=ASC(inp$):inp$=MID$(inp$,2)
  455   REM Top-bit-set values: emit |! and carry on with the low seven bits
  460   IF byte%>127:out$=out$+"|!":byte%=byte% AND 127
  465   REM Exactly one of the next five lines matches any given byte%
  470   IF byte%>31 AND byte%<>ASC"""" AND byte%<>ASC"|" AND byte%<>127:out$=out$+CHR$(byte%)
  480   IF byte%<32:out$=out$+"|"+CHR$(byte%+64)
  485   REM A quote is written doubled here rather than as |"
  490   IF byte%=ASC"""":out$=out$+""""""
  500   IF byte%=ASC"|":out$=out$+"||"
  510   IF byte%=127:out$=out$+"|?"
  520 UNTIL inp$=""
  525 REM The encoded text is always returned wrapped in quotes
  530 =""""+out$+""""
  540 :

No checks on string lengths are done. On decoding, invalid encodings are ignored and skipped, for instance |4 is decoded as 4.

Julia

""" 
ASCII code	Symbols used
0	        |@
1 - 26	    |letter eg |A (or |a) = ASCII 1, |M (or |m) = ASCII 13
27	        |[ or |{
28	        |\
29	        |] or |}
30	        |^ or |~
31	        |_ or |` (grave accent)
32 - 126	keyboard character, except for:
"	        |"
|	        ||
<	        |<
127	        |?
128 - 255	|!coded symbol eg ASCII 128 = |!|@ ASCII 129 = |!|A

See also www.riscos.com/support/developers/prm/conversions.html
"""

"""
    GSTrans_encode(arr::Vector{Char})

Encode a vector of single-byte characters as a GSTrans string.

To avoid Unicode multibyte glitches the input is handled as a vector of `Char`s, each of
which must have a code point in `0:255`:

- `0:31` become `|` followed by the character 64 code points higher (`|@` to `|_`);
- `"`, `|` and `<` are prefixed with `|`;
- `127` becomes `|?`;
- `128:255` become `|!` followed by the encoding of the value minus 128;
- every other character is copied unchanged.

# Throws
- `AssertionError` if any character has a code point above 255.
"""
function GSTrans_encode(arr::Vector{Char})
    io = IOBuffer()
    for c in arr
        i = Int(c)
        # Thrown explicitly (not via `@assert`, which may be compiled out) so the range
        # check always runs while callers still see the same exception type and message.
        0 <= i <= 255 || throw(AssertionError("Char value of $c, $i, is out of range"))
        if i >= 128
            # Top-bit-set byte: flag it, then encode the low seven bits as usual.
            print(io, "|!")
            i -= 128
        end
        if i <= 31
            print(io, '|', Char(i + 64))
        elseif i == 34 || i == 60 || i == 124
            # '"', '<' and '|' carry special meaning in GSTrans and must be escaped.
            print(io, '|', Char(i))
        elseif i == 127
            print(io, "|?")
        else
            print(io, Char(i))
        end
    end
    return String(take!(io))
end

"""
    GSTrans_encode(str::AbstractString)

Encode `str` as a GSTrans string, one UTF-8 byte at a time.

The string is converted to a `String`, each of its code units (bytes) is turned into a
`Char` with that code point, and the resulting vector is handed to the
`Vector{Char}` method. Going through `String` and `codeunits` lets any
`AbstractString` (for example a `SubString`) be encoded, not only `String`.
"""
GSTrans_encode(str::AbstractString) = GSTrans_encode(Char.(codeunits(String(str))))

# Value (0:127 for valid escapes) denoted by the character `c` that follows a `|` flag.
# A character that is not a recognised escape is returned as its own code point, i.e. it
# decodes as if the `|` flag had not been present.
function _gstrans_barvalue(c::AbstractChar)
    i = Int(c)
    if c == '?'
        return 127
    elseif c == '`'
        return 31
    elseif c == '|' || c == '"' || c == '<'
        return i
    elseif 64 <= i <= 95        # @, A-Z, [ \ ] ^ _
        return i - 64
    elseif 97 <= i <= 126       # a-z, { } ~  ('|' was handled above)
        return i - 96
    else
        return i
    end
end

"""
    GSTrans_decode(str::AbstractString)

Decode the GSTrans string `str` and return the decoded bytes as a `String`.

Recognised sequences are `|@`, `|A`-`|Z` (either case), `|[`/`|{`, `|\\`, `|]`/`|}`,
`|^`/`|~`, `|_`/`` |` `` for the values 0-31, `|?` for 127, `|"`, `||` and `|<` for the
literal character, and `|!` as a prefix that adds 128 to the character or escape that
follows. Surrounding quotes are not removed.

# Error checking
- `|` followed by any other character decodes to that character unchanged.
- A trailing `|` or `|!` with nothing after it is ignored.
- A character or decoded value that does not fit in one byte (above 255) throws an
  `InexactError`.
"""
function GSTrans_decode(str::AbstractString)
    result = UInt8[]
    gotbar, gotbang, bangadd = false, false, 0
    for c in str
        if gotbang
            # Just seen "|!": either another escape follows, or a plain character + 128.
            if c == '|'
                bangadd = 128
                gotbar = true
            else
                push!(result, Int(c) + 128)
            end
            gotbang = false
        elseif gotbar
            if c == '!'
                gotbang = true
            else
                push!(result, _gstrans_barvalue(c) + bangadd)
            end
            gotbar, bangadd = false, 0
        elseif c == '|'
            gotbar = true
        else
            push!(result, c)
        end
    end
    return String(result)
end

# Fixed round-trip inputs, eight random 10-character inputs drawn from code points 0:255,
# and already-encoded strings that are only decoded.
const TESTS = ["ALERT|G", "wert↑"]
const RAND_TESTS = map(_ -> String(Char.(rand(0:255, 10))), 1:8)
const DECODE_TESTS = ["|LHello|G|J|M", "|m|j|@|e|!t|m|!|?"]

# Every input must survive an encode/decode round trip unchanged.
for original in vcat(TESTS, RAND_TESTS)
    enc = GSTrans_encode(original)
    dec = GSTrans_decode(enc)
    println("String $original encoded is: $enc, decoded is: $dec.")
    @assert original == dec
end

# `display` shows the decoded control characters in escaped form.
foreach(DECODE_TESTS) do enc
    print("Encoded string $enc decoded is: ")
    display(GSTrans_decode(enc))
end

Output:

String ALERT|G encoded is: ALERT||G, decoded is: ALERT|G.
String wert↑ encoded is: wert|!b|!|F|!|Q, decoded is: wert↑.
String @♂aN°$ª7Î encoded is: @|KaN|!B|!0$|!B|!*7|!B|!|R|!C|!|N, decoded is: @♂aN°$ª7Î.
String ÙC▼æÔt6¤☻Ì encoded is: |!C|!|YC|_|!C|!&|!C|!|Tt6|!B|!$|B|!C|!|L, decoded is: ÙC▼æÔt6¤☻Ì.
String "@)Ð♠qhýÌÿ encoded is: |"@)|!C|!|P|Fqh|!C|!=|!C|!|L|!C|!?, decoded is: "@)Ð♠qhýÌÿ.
String +☻#o9$u♠©A encoded is: +|B#o9$u|F|!B|!)A, decoded is: +☻#o9$u♠©A.
String ♣àlæi6Ú.é encoded is: |E|!C|! l|!B|!|K|!C|!&i6|!C|!|Z.|!C|!), decoded is: ♣àlæi6Ú.é.
String ÏÔ♀È♥@ë encoded is: |!C|!|O|!C|!|T|!B|!|[|Lj|!C|!|H|C@|!B|!|I|!C|!+, decoded is: ÏÔ♀È♥@ë.
String Rç÷%◄MZûhZ encoded is: R|!C|!'|!C|!7%|QMZ|!C|!;hZ, decoded is: Rç÷%◄MZûhZ.
String ç>¾AôVâ♫↓P encoded is: |!C|!'>|!B|!>A|!C|!4V|!C|!|"|N|YP, decoded is: ç>¾AôVâ♫↓P.
Encoded string |LHello|G|J|M decoded is: "\fHello\a\n\r"
Encoded string |m|j|@|e|!t|m|!|? decoded is: "\r\n\0\x05\xf4\r\xff"

Wren

Strings in Wren are just an immutable array of bytes. They are usually interpreted as UTF-8 but don't have to be. Unicode characters in the example Julia strings are therefore encoded using their constituent UTF-8 bytes which decodes fine but may not give the same encoding as Julia itself.

If an invalid byte (following the "|" flag) is encountered whilst decoding, it is decoded as if the flag were not present.

Where strings contain control characters, their decoded version is printed to the terminal as a byte list.

class GSTrans {
    // Encodes the bytes of 's' as a GSTrans string. Control codes 1..26 are written
    // with an upper case letter when 'upper' is true and a lower case letter otherwise.
    // Aborts the fiber if 's' is not a non-empty string.
    static encode(s, upper) {
        if (!(s is String && s.count > 0)) Fiber.abort("Argument must be a non-empty string.")

        // drop one pair of enclosing quotation marks, if present
        if (s.count > 1 && s[0] == "\"" && s[-1] == "\"") s = s[1..-2]

        // letters for codes 1..26 sit 64 (upper case) or 96 (lower case) above the code
        var letterBase = upper ? 64 : 96

        // encodes a single value below 128
        var encodeLow = Fn.new { |b|
            if (b >= 1 && b <= 26) return "|" + String.fromByte(b + letterBase)
            if (b < 32) return "|" + String.fromByte(b + 64)
            if (b == 34) return "|\""    // quotation mark
            if (b == 60) return "|<"     // less than
            if (b == 124) return "||"    // vertical bar
            if (b == 127) return "|?"    // DEL
            return String.fromByte(b)
        }

        // walk the string's bytes; values of 128 and above get the "|!" prefix
        var enc = ""
        for (b in s.bytes) {
            var piece = (b < 128) ? encodeLow.call(b) : "|!" + encodeLow.call(b - 128)
            enc = enc + piece
        }
        return enc
    }

    // Decodes the GSTrans string 's' into a string of raw bytes. A byte after "|" that
    // is not a recognised escape is decoded as itself. Aborts the fiber if 's' is not a
    // non-empty string.
    static decode(s) {
        if (!(s is String && s.count > 0)) Fiber.abort("Argument must be a non-empty string.")

        // drop one pair of enclosing quotation marks, if present
        if (s.count > 1 && s[0] == "\"" && s[-1] == "\"") s = s[1..-2]

        // value denoted by the byte that follows "|"
        var unescape = Fn.new { |b|
            if (b == 34 || b == 60 || b == 124) return b   // " < | stand for themselves
            if (b == 63) return 127                        // question mark -> DEL
            if (b == 96) return 31                         // grave accent
            if (b >= 64 && b < 96) return b - 64           // @ + upper case letter + [\]^_
            if (b >= 97 && b < 127) return b - 96          // lower case letter + {}~
            return b
        }

        var bytes = s.bytes.toList
        var bc = bytes.count
        var dec = ""
        var i = 0

        // walk the bytes; 124 is "|" and 33 is "!"
        while (i < bc) {
            var left = bc - i   // bytes remaining, counting bytes[i]
            if (bytes[i] != 124) {
                // plain byte
                dec = dec + String.fromByte(bytes[i])
                i = i + 1
            } else if (left > 1 && bytes[i+1] != 33) {
                // "|x"
                dec = dec + String.fromByte(unescape.call(bytes[i+1]))
                i = i + 2
            } else if (left > 2 && bytes[i+2] != 124) {
                // "|!x"
                dec = dec + String.fromByte(128 + bytes[i+2])
                i = i + 3
            } else if (left > 3 && bytes[i+2] == 124) {
                // "|!|x"
                dec = dec + String.fromByte(128 + unescape.call(bytes[i+3]))
                i = i + 4
            } else {
                // incomplete sequence: skip the bar
                i = i + 1
            }
        }
        return dec
    }
}

// raw inputs for the round-trip demonstration
var strings = [
    "\fHello\a\n\r",
    "\r\n\0\x05\xf4\r\xff"
]

// the same inputs as raw string literals, so the escapes print verbatim
var texts = [
    """\fHello\a\n\r""",
    """\r\n\0\x05\xf4\r\xff"""
]

// letter case to use when encoding each input
var uppers = [true, false]

var idx = 0
while (idx < strings.count) {
    var raw = strings[idx]
    var label = "\"" + texts[idx] +"\""
    var useUpper = uppers[idx]
    var encoded = GSTrans.encode(raw, useUpper)
    var decoded = GSTrans.decode(encoded)
    System.print("string: %(label)")
    System.print("encoded (%(useUpper ? "upper" : "lower")) : %(encoded)")
    System.print("decoded (as byte list): %(decoded.bytes.toList)")
    System.print("string == decoded ? %(decoded == raw)\n")
    idx = idx + 1
}

// the strings shown in the Julia entry's output
var jstrings = [
    "ALERT|G",
    "wert↑",
    "@♂aN°$ª7Î",
    "ÙC▼æÔt6¤☻Ì",
    "\"@)Ð♠qhýÌÿ",
    "+☻#o9$u♠©A",
    "♣àlæi6Ú.é",
    "ÏÔ♀È♥@ë",
    "Rç÷\%◄MZûhZ",
    "ç>¾AôVâ♫↓P"
]

System.print("Julia strings: string -> encoded (upper) <- decoded (same or different)\n")
jstrings.each { |text|
    var encoded = GSTrans.encode(text, true)
    var decoded = GSTrans.decode(encoded)
    var matches = (text == decoded)
    System.print("  %(text) -> %(encoded) <- %(decoded) (%(matches ? "same" : "different"))")
}

Output:

string: "\fHello\a\n\r"
encoded (upper) : |LHello|G|J|M
decoded (as byte list): [12, 72, 101, 108, 108, 111, 7, 10, 13]
string == decoded ? true

string: "\r\n\0\x05\xf4\r\xff"
encoded (lower) : |m|j|@|e|!t|m|!|?
decoded (as byte list): [13, 10, 0, 5, 244, 13, 255]
string == decoded ? true

Julia strings: string -> encoded (upper) <- decoded (same or different)

  ALERT|G -> ALERT||G <- ALERT|G (same)
  wert↑ -> wert|!b|!|F|!|Q <- wert↑ (same)
  @♂aN°$ª7Î -> @|!b|!|Y|!|BaN|!B|!0$|!B|!*7|!C|!|N <- @♂aN°$ª7Î (same)
  ÙC▼æÔt6¤☻Ì -> |!C|!|YC|!b|!|V|!|<|!C|!&|!C|!|Tt6|!B|!$|!b|!|X|!;|!C|!|L <- ÙC▼æÔt6¤☻Ì (same)
  "@)Ð♠qhýÌÿ -> |"@)|!C|!|P|!b|!|Y|! qh|!C|!=|!C|!|L|!C|!? <- "@)Ð♠qhýÌÿ (same)
  +☻#o9$u♠©A -> +|!b|!|X|!;#o9$u|!b|!|Y|! |!B|!)A <- +☻#o9$u♠©A (same)
  ♣àlæi6Ú.é -> |!b|!|Y|!#|!C|! l|!C|!&i6|!C|!|Z.|!C|!) <- ♣àlæi6Ú.é (same)
  ÏÔ♀È♥@ë -> |!C|!|O|!C|!|T|!b|!|Y|!|@|!C|!|H|!b|!|Y|!%@|!C|!+ <- ÏÔ♀È♥@ë (same)
  Rç÷%◄MZûhZ -> R|!C|!'|!C|!7%|!b|!|W|!|DMZ|!C|!;hZ <- Rç÷%◄MZûhZ (same)
  ç>¾AôVâ♫↓P -> |!C|!'>|!B|!>A|!C|!4V|!C|!|"|!b|!|Y|!+|!b|!|F|!|SP <- ç>¾AôVâ♫↓P (same)