UTF-8 encode and decode

From Rosetta Code
Revision as of 16:34, 24 April 2017 by rosettacode>Aspectcl (another take in Tcl)
Task
UTF-8 encode and decode
You are encouraged to solve this task according to the task description, using any language you may know.

As described in UTF-8 and in Wikipedia, UTF-8 is a popular encoding of (multi-byte) Unicode code-points into eight-bit octets.

The goal of this task is to write an encoder that takes a Unicode code-point (an integer representing a Unicode character) and returns a sequence of 1-4 bytes representing that character in the UTF-8 encoding.

Then you have to write the corresponding decoder that takes a sequence of 1-4 UTF-8 encoded bytes and returns the corresponding Unicode character.

Demonstrate the functionality of your encoder and decoder on the following five characters:

Character   Name                                  Unicode    UTF-8 encoding (hex)
---------------------------------------------------------------------------------
A           LATIN CAPITAL LETTER A                U+0041     41
ö           LATIN SMALL LETTER O WITH DIAERESIS   U+00F6     C3 B6
Ж           CYRILLIC CAPITAL LETTER ZHE           U+0416     D0 96
€           EURO SIGN                             U+20AC     E2 82 AC
𝄞           MUSICAL SYMBOL G CLEF                 U+1D11E    F0 9D 84 9E

Provided below is a reference implementation in Common Lisp.

Common Lisp

Helper functions

<lang lisp> (defun ascii-byte-p (octet)
  "Return t if octet is a single-byte 7-bit ASCII char.
  The most significant bit is 0, so the allowed pattern is 0xxx xxxx.
  Signals an error unless octet is an integer in the range 0-255."
  ;; Check the octet range directly: (integer-length n) is also <= 8 for
  ;; negative n down to -256, so a length test alone lets negatives through.
  (assert (typep octet '(unsigned-byte 8)))
  (let ((bitmask  #b10000000)
        (template #b00000000))
    ;; bitwise-and with the bitmask #b10000000 to extract the most significant bit,
    ;; then check that it equals the template #b00000000 (i.e. the bit is clear).
    (= (logand bitmask octet) template)))

(defun multi-byte-p (octet)
  "Return t if octet is a part of a multi-byte UTF-8 sequence.
  The multibyte pattern is 1xxx xxxx. A multi-byte can be either a lead byte or a trail byte.
  Signals an error unless octet is an integer in the range 0-255."
  ;; Check the octet range directly: (integer-length n) is also <= 8 for
  ;; negative n down to -256, so a length test alone lets negatives through.
  (assert (typep octet '(unsigned-byte 8)))
  (let ((bitmask  #b10000000)
        (template #b10000000))
    ;; bitwise-and with the bitmask #b10000000 to extract the most significant bit,
    ;; then check that it equals the template #b10000000 (i.e. the bit is set).
    (= (logand bitmask octet) template)))

(defun lead-byte-p (octet)
  "Return t if octet is one of the leading bytes of an UTF-8 sequence, nil otherwise.
  Allowed leading byte patterns are 0xxx xxxx, 110x xxxx, 1110 xxxx and 1111 0xxx.
  Signals an error unless octet is an integer in the range 0-255."
  ;; Check the octet range directly: (integer-length n) is also <= 8 for
  ;; negative n down to -256, so a length test alone lets negatives through.
  (assert (typep octet '(unsigned-byte 8)))
  ;; Each bitmask selects the prefix bits of one lead-byte pattern; the
  ;; template at the same position is the value those bits must have.
  (let ((bitmasks  (list #b10000000 #b11100000 #b11110000 #b11111000))
        (templates (list #b00000000 #b11000000 #b11100000 #b11110000)))
    (some #'(lambda (bitmask template)
              (= (logand bitmask octet) template))
          bitmasks
          templates)))

(defun n-trail-bytes (octet)
  "Take a leading UTF-8 byte, return the number of continuation bytes (0-3) that follow it.
  Return nil if octet matches none of the lead byte patterns, e.g. a trail byte 10xx xxxx.
  Signals an error unless octet is an integer in the range 0-255."
  ;; Check the octet range directly: (integer-length n) is also <= 8 for
  ;; negative n down to -256, so a length test alone lets negatives through.
  (assert (typep octet '(unsigned-byte 8)))
  ;; Walk the mask/template pairs in parallel; the position of the first
  ;; matching pair is the number of trail bytes announced by the lead byte.
  (loop for bitmask  in '(#b10000000 #b11100000 #b11110000 #b11111000)
        for template in '(#b00000000 #b11000000 #b11100000 #b11110000)
        for n from 0
        when (= template (logand bitmask octet))
          return n))

</lang>

Encoder

<lang lisp> (defun unicode-to-utf-8 (int)
  "Take a unicode code point, return a list of one to four UTF-8 encoded bytes (octets).
  The list is ordered lead byte first. Signals an error unless int is an integer
  in the range 0 to #x10FFFF. Surrogate code points are not rejected."
  ;; A 21-bit length limit alone would admit #x110000-#x1FFFFF (and negative
  ;; integers), for which no byte count is defined below.
  (assert (typep int '(integer 0 #x10FFFF)))
  (let* ((n-trail-bytes (cond ((<= int #x00007F) 0)
                              ((<= int #x0007FF) 1)
                              ((<= int #x00FFFF) 2)
                              (t 3)))
         ;; marker bits of the lead byte for 0, 1, 2 or 3 trail bytes.
         (lead-template (nth n-trail-bytes
                             '(#b00000000 #b11000000 #b11100000 #b11110000)))
         ;; number of content bits in the lead byte.
         (n-lead-bits (nth n-trail-bytes '(7 5 4 3)))
         ;; marker bits of every trail byte.
         (trail-template #b10000000)
         ;; number of content bits in a trail byte.
         (n-trail-bits 6)
         ;; list to put the UTF-8 encoded bytes in.
         (byte-list nil))
    ;; fill the trail bytes with 6 bits each, least significant group first;
    ;; pushing leaves the most significant trail byte at the front of the list.
    ;; With 0 trail bytes (plain ASCII) this loop does not run.
    (dotimes (i n-trail-bytes)
      (push (logior trail-template
                    (ldb (byte n-trail-bits (* i n-trail-bits)) int))
            byte-list))
    ;; then copy the remaining content bits to the lead byte.
    (push (logior lead-template
                  (ldb (byte n-lead-bits (* n-trail-bytes n-trail-bits)) int))
          byte-list)
    ;; return the list of UTF-8 encoded bytes.
    byte-list))

</lang>

Decoder

<lang lisp> (defun utf-8-to-unicode (byte-list)
  "Take a list of one to four utf-8 encoded bytes (octets), return a code point.
  Signals an error if the first byte is not a lead byte, if the number of bytes
  does not match the count announced by the lead byte, or if a following byte
  is not a continuation byte of the form 10xx xxxx."
  (let ((b1 (car byte-list)))
    (cond ((ascii-byte-p b1) b1) ; if a single byte, just return it.
          ((not (lead-byte-p b1))
           (error "first byte in the list isnt a lead byte"))
          (t
           (let ((n (n-trail-bytes b1))
                 ;; Content bits we want to extract from each lead byte.
                 (lead-templates (list #b01111111 #b00011111 #b00001111 #b00000111))
                 ;; Content bits we want to extract from each trail byte.
                 (trail-template #b00111111))
             (unless (= n (1- (list-length byte-list)))
               (error "calculated number of bytes doesnt match the length of the byte list"))
             ;; start with the content bits of the lead byte, then shift in
             ;; six content bits from every trail byte, most significant first.
             (let ((code-point (logand b1 (nth n lead-templates))))
               (dolist (octet (cdr byte-list) code-point)
                 ;; a trail byte must be an octet with the prefix bits 10.
                 (unless (and (typep octet '(unsigned-byte 8))
                              (= (logand octet #b11000000) #b10000000))
                   (error "byte ~x is not a UTF-8 continuation byte" octet))
                 (setf code-point
                       (logior (ash code-point 6)
                               (logand octet trail-template))))))))))

</lang>

The test

<lang lisp> (defun test-utf-8 ()
  "Return t if the chosen unicode points are encoded and decoded correctly.
  Also prints one line per code point showing the character, the code point
  in hex and its UTF-8 bytes in hex."
  (let ((unicodes-orig (list 65 246 1046 8364 119070)))
    ;; dolist rather than mapcar: the loop runs only for its output, so
    ;; there is no reason to collect a result list.
    (dolist (code-point unicodes-orig)
      (format t
              "character ~A, code point: ~6x, utf-8: ~{~x ~}~%"
              (code-char code-point)
              code-point
              (unicode-to-utf-8 code-point)))
    ;; return t if every code point survives the encode/decode round trip.
    (every #'(lambda (code-point)
               (= code-point (utf-8-to-unicode (unicode-to-utf-8 code-point))))
           unicodes-orig)))

</lang>

Test output

<lang lisp> CL-USER> (test-utf-8)
character A, code point: 41, utf-8: 41
character ö, code point: F6, utf-8: C3 B6
character Ж, code point: 416, utf-8: D0 96
character €, code point: 20AC, utf-8: E2 82 AC
character 𝄞, code point: 1D11E, utf-8: F0 9D 84 9E
T </lang>

Go

<lang go>package main

import (

   "fmt"
   "unicode/utf8"

)

// utf8encode returns the UTF-8 encoding of codepoint as a slice of 1-4 bytes.
func utf8encode(codepoint rune) []byte {
	// utf8.UTFMax (4) is the longest encoding a single rune can need.
	var scratch [utf8.UTFMax]byte
	n := utf8.EncodeRune(scratch[:], codepoint)
	return scratch[:n]
}

// utf8decode returns the first rune encoded in bytes. The width reported by
// utf8.DecodeRune is discarded.
func utf8decode(bytes []byte) (decoded rune) {
	decoded, _ = utf8.DecodeRune(bytes)
	return decoded
}

func main() {

       fmt.Printf("%-7s %7s\t%s\t%s\n", "Char", "Unicode", "UTF-8 encoded", "Decoded");
   for _, codepoint := range []rune{'A', 'ö', 'Ж', '€', '𝄞'} {
       encoded := utf8encode(codepoint)
       decoded := utf8decode(encoded)
       fmt.Printf("%-7c U+%04X\t%-12X\t%c\n", codepoint, codepoint, encoded, decoded)
   }

}</lang>

Output:
Char    Unicode	UTF-8 encoded	Decoded
A       U+0041	41          	A
ö       U+00F6	C3B6        	ö
Ж       U+0416	D096        	Ж
€       U+20AC	E282AC      	€
𝄞       U+1D11E	F09D849E    	𝄞

Alternately: <lang go>package main

import (

   "fmt"

)

// utf8encode returns the UTF-8 encoding of codepoint, relying on Go's
// built-in rune-to-string conversion.
func utf8encode(codepoint rune) []byte {
	asString := string(codepoint)
	return []byte(asString)
}

// utf8decode returns the first rune encoded in bytes, relying on Go's
// built-in string-to-rune conversion. Empty input yields U+FFFD (the Unicode
// replacement character) instead of panicking on an out-of-range index.
func utf8decode(bytes []byte) rune {
	if len(bytes) == 0 {
		return '\uFFFD'
	}
	return []rune(string(bytes))[0]
}

func main() {

       fmt.Printf("%-7s %7s\t%s\t%s\n", "Char", "Unicode", "UTF-8 encoded", "Decoded");
   for _, codepoint := range []rune{'A', 'ö', 'Ж', '€', '𝄞'} {
       encoded := utf8encode(codepoint)
       decoded := utf8decode(encoded)
       fmt.Printf("%-7c U+%04X\t%-12X\t%c\n", codepoint, codepoint, encoded, decoded)
   }

}</lang>

Output:
Char    Unicode	UTF-8 encoded	Decoded
A       U+0041	41          	A
ö       U+00F6	C3B6        	ö
Ж       U+0416	D096        	Ж
€       U+20AC	E282AC      	€
𝄞       U+1D11E	F09D849E    	𝄞

Java

Works with: Java version 7+

<lang java>import java.util.Formatter; import java.io.UnsupportedEncodingException;

public class UTF8EncodeDecode {

   public static byte[] utf8encode(int codepoint) throws UnsupportedEncodingException {
       return new String(new int[]{codepoint}, 0, 1).getBytes("UTF-8");
   }
   public static int utf8decode(byte[] bytes) throws UnsupportedEncodingException {
       return new String(bytes, "UTF-8").codePointAt(0);
   }
   public static final void main(String[] args) throws UnsupportedEncodingException {
       System.out.printf("%-7s %-43s %7s\t%s\t%7s\n", "Char", "Name", "Unicode", "UTF-8 encoded", "Decoded");
       for (int codepoint : new int[]{0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E}) {
           byte[] encoded = utf8encode(codepoint);
           Formatter formatter = new Formatter();
           for (byte b : encoded) {
               formatter.format("%02X ", b);
           }
           String encodedHex = formatter.toString();
           int decoded = utf8decode(encoded);
           System.out.printf("%-7c %-43s U+%04X\t%-12s\tU+%04X\n", codepoint, Character.getName(codepoint), codepoint, encodedHex, decoded);
       }
   }
   public static final void main(String[] args) throws UnsupportedEncodingException {
       System.out.printf("%-7s %-43s %7s\t%s\t%s\n", "Char", "Name", "Unicode", "UTF-8 encoded", "Decoded");
       for (int codepoint : new int[]{0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E}) {
           byte[] encoded = utf8encode(codepoint);
           Formatter formatter = new Formatter();
           for (byte b : encoded) {
               formatter.format("%02X ", b);
           }
           String encodedHex = formatter.toString();
           int decoded = utf8decode(encoded);
           System.out.printf("%-7c %-43s U+%04X\t%-12s\t%c\n", codepoint, Character.getName(codepoint), codepoint, encodedHex, decoded);
       }
   }

}</lang>

Output:
Char    Name                                        Unicode	UTF-8 encoded	Decoded
A       LATIN CAPITAL LETTER A                      U+0041	41          	A
ö       LATIN SMALL LETTER O WITH DIAERESIS         U+00F6	C3 B6       	ö
Ж       CYRILLIC CAPITAL LETTER ZHE                 U+0416	D0 96       	Ж
€       EURO SIGN                                   U+20AC	E2 82 AC    	€
𝄞      MUSICAL SYMBOL G CLEF                       U+1D11E	F0 9D 84 9E 	𝄞

Perl 6

Works with: Rakudo version 2017.02

Pretty much all built in to the language. <lang perl6># Header row followed by a rule of 94 dashes.
say sprintf("%-18s %-34s %7s %7s\t%s %s\n", 'Character', 'Name', 'Ordinal', 'Unicode', 'UTF-8 encoded', 'decoded'), '-' x 94;

# One row per character: name, ordinal, code point in hex, UTF-8 bytes in hex,
# and the character obtained by decoding those bytes again.
for < A ö Ж € 𝄞 😜 > -> $char {
    my $ordinal = $char.ord;
    my $encoded = $char.encode('UTF8');
    printf "   %-7s %-43s %6s U+%04s\t%12s %4s\n",
        $char, $char.uniname, $ordinal, $ordinal.base(16),
        $encoded.list».base(16).Str, $encoded.decode;
}</lang>

Output:
Character          Name                               Ordinal Unicode	UTF-8 encoded  decoded
----------------------------------------------------------------------------------------------
   A       LATIN CAPITAL LETTER A                          65 U+0041	          41    A
   ö       LATIN SMALL LETTER O WITH DIAERESIS            246 U+00F6	       C3 B6    ö
   Ж       CYRILLIC CAPITAL LETTER ZHE                   1046 U+0416	       D0 96    Ж
   €       EURO SIGN                                     8364 U+20AC	    E2 82 AC    €
   𝄞       MUSICAL SYMBOL G CLEF                       119070 U+1D11E	 F0 9D 84 9E    𝄞
   😜      FACE WITH STUCK-OUT TONGUE AND WINKING EYE  128540 U+1F61C	 F0 9F 98 9C    😜

Phix

Standard autoinclude, see the manual and/or builtins/utfconv.e ( http://phix.x10.mx/docs/html/utfconv.htm and/or https://bitbucket.org/petelomax/phix/src )
As requested in the task description: <lang Phix>constant tests = {#0041, #00F6, #0416, #20AC, #1D11E} -- the five code points listed in the task description

function hex(sequence s, string fmt) -- output helper
    -- Format every element of s with fmt and return the pieces joined by commas.
    sequence formatted = repeat(0,length(s))
    for k=1 to length(s) do
        formatted[k] = sprintf(fmt,s[k])
    end for
    return join(formatted,',')
end function

-- Convert each test code point to UTF-8 and back, printing all three forms.
for t=1 to length(tests) do
    integer codepoint = tests[t]
    sequence encoded = utf32_to_utf8({codepoint})
    sequence decoded = utf8_to_utf32(encoded)
    string encoded_hex = hex(encoded,"#%02x"),
           decoded_hex = hex(decoded,"#%04x")
    printf(1,"#%04x -> {%s} -> {%s}\n",{codepoint, encoded_hex, decoded_hex})
end for</lang>

Output:
#0041 -> {#41} -> {#0041}
#00F6 -> {#C3,#B6} -> {#00F6}
#0416 -> {#D0,#96} -> {#0416}
#20AC -> {#E2,#82,#AC} -> {#20AC}
#1D11E -> {#F0,#9D,#84,#9E} -> {#1D11E}

Racket

<lang racket>#lang racket

;; Association list mapping a symbolic character name to the character itself,
;; one entry per test character of the task.
(define char-map
  (list (cons 'LATIN-CAPITAL-LETTER-A              #\U0041)
        (cons 'LATIN-SMALL-LETTER-O-WITH-DIAERESIS #\U00F6)
        (cons 'CYRILLIC-CAPITAL-LETTER-ZHE         #\U0416)
        (cons 'EURO-SIGN                           #\U20AC)
        (cons 'MUSICAL-SYMBOL-G-CLEF               #\U1D11E)))

;; For every entry print: the character (written and displayed), its UTF-8
;; bytes as hex strings, the string decoded back from those bytes, and the name.
(for ([entry (in-list char-map)])
  (match-define (cons name chr) entry)
  (define octets (bytes->list (string->bytes/utf-8 (string chr))))
  (printf "~s\t~a\t~a\t~a\t~a~%"
          chr
          chr
          (for/list ([octet (in-list octets)])
            (number->string octet 16))
          (bytes->string/utf-8 (list->bytes octets))
          name))</lang>
Output:
#\A	A	(41)	A	LATIN-CAPITAL-LETTER-A
#\ö	ö	(c3 b6)	ö	LATIN-SMALL-LETTER-O-WITH-DIAERESIS
#\Ж	Ж	(d0 96)	Ж	CYRILLIC-CAPITAL-LETTER-ZHE
#\€	€	(e2 82 ac)	€	EURO-SIGN
#\𝄞	𝄞	(f0 9d 84 9e)	𝄞	MUSICAL-SYMBOL-G-CLEF

Sidef

<lang ruby>func utf8_encoder(Number code) {

   # Convert the code point to a character, encode it as UTF-8 and map each
   # resulting byte through .chr; the sample output shows the result as an
   # array of one-character strings.
   code.chr.encode('UTF-8').bytes.map{.chr}

}

func utf8_decoder(Array bytes) {

   # Map each element of the encoder's output through .ord, then decode the
   # resulting values as UTF-8.
   bytes.map{.ord}.decode('UTF-8')

}

for n in ([0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E]) {

   # Round-trip each test code point, check that decoding gives back the
   # original code point, then print the decoded value and the encoded form.
   var encoded = utf8_encoder(n)
   var decoded = utf8_decoder(encoded)
   assert_eq(n, decoded.ord)
   say "#{decoded} -> #{encoded}"

}</lang>

Output:
A -> ["A"]
ö -> ["\xC3", "\xB6"]
Ж -> ["\xD0", "\x96"]
€ -> ["\xE2", "\x82", "\xAC"]
𝄞 -> ["\xF0", "\x9D", "\x84", "\x9E"]

Tcl

Note: Tcl can handle Unicodes only up to U+FFFD, i.e. the Basic Multilingual Plane (BMP, 16 bits wide). Therefore, the fifth test fails as expected. <lang Tcl>proc encoder int {
   # Return the UTF-8 encoding of code point $int as a list of two-digit
   # uppercase hex strings, one per byte.
   set u [format %c $int]
   set bytes {}
   foreach byte [split [encoding convertto utf-8 $u] ""] {
      lappend bytes [format %02X [scan $byte %c]]
   }
   return $bytes
}

# Each command must start on its own line: text following a proc's closing
# brace on the same line would be passed to proc as extra arguments.
proc decoder bytes {
   # Inverse of encoder: take a list of hex byte strings, rebuild the byte
   # string and decode it from UTF-8.
   set str {}
   foreach byte $bytes {
      append str [format %c [scan $byte %x]]
   }
   return [encoding convertfrom utf-8 $str]
}

# Print each test code point, its encoded bytes and the decoded character.
foreach test {0x0041 0x00f6 0x0416 0x20ac 0x1d11e} {
   set res $test
   lappend res [encoder $test] -> [decoder [encoder $test]]
   puts $res
}</lang>

0x0041 41 -> A
0x00f6 {C3 B6} -> ö
0x0416 {D0 96} -> Ж
0x20ac {E2 82 AC} -> €
0x1d11e {EF BF BD} -> �

Alternative Implementation

While perhaps not as readable as the above, this version handles beyond-BMP codepoints by manually composing the UTF-8 byte sequences and emitting raw bytes to the console. The encoding convertto utf-8 command still does the heavy lifting where it can.

<lang Tcl>proc utf8 {codepoint} {
    # Return the UTF-8 encoding, as a byte string, of the code point given as
    # a hexadecimal string without prefix (e.g. 20AC).
    # Raises an error if the argument is not hexadecimal or lies outside
    # 0..10FFFF; larger values would not fit the 21-bit layout used below.
    if {[scan $codepoint %llx cp] != 1} {
        error "invalid hexadecimal code point \"$codepoint\""
    }
    if {$cp < 0 || $cp > 0x10FFFF} {
        error "code point \"$codepoint\" is outside the range 0..10FFFF"
    }
    if {$cp < 0x10000} {
        # BMP code points: let Tcl build the character and encode it.
        set bytes [encoding convertto utf-8 [format %c $cp]]
    } else {                                        ;# codepoints beyond the BMP need manual approach
        set bits [format %021b $cp]                 ;# format as 21-digit binary string
        set unibits    11110[string range $bits 0 2];# insert extra bits for utf-8 4-byte encoding
        append unibits 10[string range $bits 3 8]
        append unibits 10[string range $bits 9 14]
        append unibits 10[string range $bits 15 20]
        set bytes [binary format B* $unibits]       ;# turn into a sequence of bytes
    }
    return $bytes
}

proc hexchars {s} {
    # Return the bytes of $s as lowercase two-digit hex values, each one
    # followed by a single space.
    binary scan $s H* digits
    set spaced ""
    foreach pair [regexp -all -inline {..} $digits] {
        append spaced $pair " "
    }
    return $spaced
}

# for the test, we assume the tty is in utf-8 mode and can handle beyond-BMP chars
# so set output mode to binary so we can write raw bytes!

# Switch stdout to binary so the raw UTF-8 bytes are written unchanged.
# This must be its own command: fused with the foreach on one line, the
# loop words would be passed to chan configure as further options.
chan configure stdout -encoding binary

# Print each code point, its raw UTF-8 bytes and their hex representation.
foreach codepoint { 41 F6 416 20AC 1D11E } {
    set utf8 [utf8 $codepoint]
    puts "[format U+%04s $codepoint]\t$utf8\t[hexchars $utf8]"
}</lang>

Output:
U+0041  A       41
U+00F6  ö       c3 b6
U+0416  Ж       d0 96
U+20AC  €       e2 82 ac
U+1D11E 𝄞       f0 9d 84 9e

zkl

<lang zkl>println("Char Unicode UTF-8"); foreach utf,unicode_int in (T( T("\U41;",0x41), T("\Uf6;",0xf6),

     T("\U416;",0x416), T("\U20AC;",0x20ac), T("\U1D11E;",0x1d11e))){
  // Fold the characters of utf into one integer, first character most
  // significant (each step multiplies the running value by 0x100).
  utf_int:=utf.reduce(fcn(s,c){ 0x100*s + c.toAsc() },0);
  char :=unicode_int.toString(-8);	// Unicode int to UTF-8 string
  // UTF-8 bytes to UTF-8 string:
  char2:=Data(Void,utf_int.toBigEndian(utf_int.len())).text;
  // Print both strings, the code point as U+hex and the folded bytes in hex.
  println("%s %s %9s  %x".fmt(char,char2,"U+%x".fmt(unicode_int),utf_int));

}</lang> Int.len() --> number of bytes in int. This could be hard coded because UTF-8 has a max of 6 bytes and (0x41).toBigEndian(6) --> 0x41,0,0,0,0,0 which is a zero terminated string ("A");

Output:
Char  Unicode  UTF-8
A A      U+41  41
ö ö      U+f6  c3b6
Ж Ж     U+416  d096
€ €    U+20ac  e282ac
𝄞 𝄞   U+1d11e  f09d849e