UTF-8 encode and decode: Difference between revisions

Content added Content deleted

Inline

Revision as of 21:39, 14 April 2018

As described in UTF-8 and in Wikipedia, UTF-8 is a popular encoding of (multi-byte) Unicode code-points into eight-bit octets.

The goal of this task is to write an encoder that takes a Unicode code-point (an integer representing a Unicode character) and returns a sequence of 1-4 bytes representing that character in the UTF-8 encoding.

Then you have to write the corresponding decoder that takes a sequence of 1-4 UTF-8 encoded bytes and returns the corresponding Unicode character.

Demonstrate the functionality of your encoder and decoder on the following five characters:

Character   Name                                  Unicode    UTF-8 encoding (hex)
---------------------------------------------------------------------------------
A           LATIN CAPITAL LETTER A                U+0041     41
ö           LATIN SMALL LETTER O WITH DIAERESIS   U+00F6     C3 B6
Ж           CYRILLIC CAPITAL LETTER ZHE           U+0416     D0 96
€           EURO SIGN                             U+20AC     E2 82 AC
𝄞           MUSICAL SYMBOL G CLEF                 U+1D11E    F0 9D 84 9E

Provided below is a reference implementation in Common Lisp.

Common Lisp

Helper functions

<lang lisp> (defun ascii-byte-p (octet)
  "Return true when OCTET is a single-byte, 7-bit ASCII UTF-8 unit.
  Such an octet has its most significant bit clear: 0xxx xxxx."
  (assert (typep octet 'integer))
  (assert (<= (integer-length octet) 8))
  ;; Only bit 7 is inspected: it is 0 for ASCII and 1 for every byte
  ;; that belongs to a multi-byte sequence.
  (zerop (ldb (byte 1 7) octet)))

(defun multi-byte-p (octet)
  "Return true when OCTET belongs to a multi-byte UTF-8 sequence.
  Both lead bytes and trail bytes of such a sequence match 1xxx xxxx."
  (assert (typep octet 'integer))
  (assert (<= (integer-length octet) 8))
  ;; Only the most significant bit is inspected; it is set on every
  ;; byte of a multi-byte sequence, lead or trail.
  (= 1 (ldb (byte 1 7) octet)))

(defun lead-byte-p (octet)
  "Return true when OCTET can start a UTF-8 sequence, NIL otherwise.
  The accepted patterns are 0xxx xxxx, 110x xxxx, 1110 xxxx and 1111 0xxx."
  (assert (typep octet 'integer))
  (assert (<= (integer-length octet) 8))
  ;; Each entry is (mask . template): mask selects the prefix bits of
  ;; the octet and template is the value those bits must have.
  (loop for (mask . template) in '((#b10000000 . #b00000000)
                                   (#b11100000 . #b11000000)
                                   (#b11110000 . #b11100000)
                                   (#b11111000 . #b11110000))
        thereis (= (logand mask octet) template)))

(defun n-trail-bytes (octet)
  "Take a leading UTF-8 byte and return how many continuation bytes follow it.
  The result is 0 for an ASCII byte and 1, 2 or 3 for the lead byte of a
  two-, three- or four-byte sequence.  NIL is returned when OCTET matches
  none of the lead-byte patterns."
  (assert (typep octet 'integer))
  (assert (<= (integer-length octet) 8))
  ;; The position of the first matching (mask, template) pair is the
  ;; number of trail bytes.
  (loop for mask     in '(#b10000000 #b11100000 #b11110000 #b11111000)
        for template in '(#b00000000 #b11000000 #b11100000 #b11110000)
        for count from 0
        when (= template (logand mask octet))
          return count))

</lang>

Encoder

<lang lisp> (defun unicode-to-utf-8 (int)
  "Take a Unicode code point, return a list of one to four UTF-8 encoded bytes (octets).
  INT must be an integer in the range 0 to #x10FFFF; anything else fails an
  assertion.  Surrogate code points (#xD800 to #xDFFF) are not rejected."
  (assert (typep int 'integer))
  ;; A bare 21-bit check would let negative numbers and the values
  ;; #x110000 to #x1FFFFF through, so test the real code point range.
  (assert (<= 0 int #x10FFFF) ()
          "~S is not a Unicode code point (expected 0 to #x10FFFF)." int)
  (let ((n-trail-bytes (cond ((<= int #x00007F) 0)
                             ((<= int #x0007FF) 1)
                             ((<= int #x00FFFF) 2)
                             (t 3)))
        ;; prefix bits of the lead byte, indexed by the number of trail bytes.
        (lead-templates #(#b00000000 #b11000000 #b11100000 #b11110000))
        ;; number of content bits in the lead byte, indexed the same way.
        (n-lead-bits #(7 5 4 3))
        ;; prefix bits of every trail byte.
        (trail-template #b10000000)
        ;; number of content bits in each trail byte.
        (n-trail-bits 6)
        ;; list to put the UTF-8 encoded bytes in.
        (byte-list nil))
    ;; Fill the trail bytes with 6 bits each, lowest bits first.  PUSH
    ;; prepends, so the finished list runs from lead byte to last trail byte.
    (dotimes (i n-trail-bytes)
      (push (logior trail-template
                    (ldb (byte n-trail-bits (* i n-trail-bits)) int))
            byte-list))
    ;; The remaining high bits go into the lead byte.  With zero trail
    ;; bytes this is the 7-bit ASCII value itself.
    (push (logior (aref lead-templates n-trail-bytes)
                  (ldb (byte (aref n-lead-bits n-trail-bytes)
                             (* n-trail-bytes n-trail-bits))
                       int))
          byte-list)
    ;; return the list of UTF-8 encoded bytes.
    byte-list))

</lang>

Decoder

<lang lisp> (defun utf-8-to-unicode (byte-list)
  "Take a list of one to four UTF-8 encoded bytes (octets), return a code point.
  An error is signalled when the first byte is not a lead byte or when the
  list length does not match the length announced by the lead byte."
  (let ((lead (first byte-list)))
    (cond
      ;; a single ASCII byte is its own code point.
      ((ascii-byte-p lead) lead)
      ((not (multi-byte-p lead)) nil)
      ((not (lead-byte-p lead))
       (error "first byte in the list isnt a lead byte"))
      (t
       (let ((n (n-trail-bytes lead))
             ;; content bits of the lead byte, indexed by trail byte count.
             (lead-masks (list #b01111111 #b00011111 #b00001111 #b00000111))
             ;; content bits of every trail byte.
             (trail-mask #b00111111))
         (unless (= n (1- (list-length byte-list)))
           (error "calculated number of bytes doesnt match the length of the byte list"))
         ;; Start from the lead byte's payload, then shift in six payload
         ;; bits per trail byte, most significant group first.
         (reduce #'(lambda (acc octet)
                     (logior (ash acc 6) (logand octet trail-mask)))
                 (rest byte-list)
                 :initial-value (logand lead (nth n lead-masks))))))))

</lang>

The test

<lang lisp> (defun test-utf-8 ()
  "Print the UTF-8 encoding of the chosen code points and return true
  when each of them survives an encode/decode round trip unchanged."
  (let* ((code-points (list 65 246 1046 8364 119070))
         ;; encode and decode every code point before printing anything.
         (round-tripped (loop for cp in code-points
                              collect (utf-8-to-unicode (unicode-to-utf-8 cp)))))
    ;; one report line per code point.
    (dolist (cp code-points)
      (format t
              "character ~A, code point: ~6x, utf-8: ~{~x ~}~%"
              (code-char cp)
              cp
              (unicode-to-utf-8 cp)))
    ;; true only if every round trip reproduced its input.
    (every #'= code-points round-tripped)))

</lang>

Test output

<lang lisp> CL-USER> (test-utf-8) character A, code point: 41, utf-8: 41 character ö, code point: F6, utf-8: C3 B6 character Ж, code point: 416, utf-8: D0 96 character €, code point: 20AC, utf-8: E2 82 AC character 𝄞, code point: 1D11E, utf-8: F0 9D 84 9E T </lang>

C

<lang c>#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>

/* Describes one kind of UTF-8 byte: which bits carry data, what the fixed
 * prefix bits look like, and (for lead bytes) which code point range uses
 * a sequence of that length. */
typedef struct {
	char mask;    /* the char data is in these bits */
	char lead;    /* the start bytes of a utf-8 encoded char */
	uint32_t beg; /* beginning of codepoint range */
	uint32_t end; /* end of codepoint range */
} utf_t;

/* Index 0 describes a continuation byte; indices 1..4 describe the lead
 * byte of a sequence of that many bytes.  Constants are written in hex
 * because the 0b prefix is a compiler extension before C23. */
utf_t * utf[] = {
	/*             mask  lead  beg      end      */
	[0] = &(utf_t){0x3F, 0x80, 0,       0        },
	[1] = &(utf_t){0x7F, 0x00, 0x0000,  0x007F   },
	[2] = &(utf_t){0x1F, 0xC0, 0x0080,  0x07FF   },
	[3] = &(utf_t){0x0F, 0xE0, 0x0800,  0xFFFF   },
	[4] = &(utf_t){0x07, 0xF0, 0x10000, 0x10FFFF },
	      &(utf_t){0},
};

/* All lengths are in bytes */
int codepoint_len(const uint32_t cp); /* len of associated utf-8 char */
int utf8_len(const char ch);          /* len of utf-8 encoded char */

/* Conversions between a code point and its UTF-8 byte sequence. */
char *to_utf8(const uint32_t cp);
uint32_t to_cp(const char chr[4]);

/* Return the number of bytes (1..4) needed to encode cp in UTF-8.
 * Terminates the program with exit(1) when cp lies outside every range
 * in the utf table. */
int codepoint_len(const uint32_t cp)
{
	/* Entries 1..4 are the lead-byte descriptions and the index equals the
	 * sequence length.  Entry 0 is the continuation byte and is skipped,
	 * so U+0000 is reported as a one-byte sequence. */
	for (int len = 1; len <= 4; ++len) {
		if ((cp >= utf[len]->beg) && (cp <= utf[len]->end)) {
			return len;
		}
	}

	/* Out of bounds */
	exit(1);
}

/* Return the length in bytes (1..4) of the UTF-8 sequence that starts with
 * the lead byte ch.  Terminates the program with exit(1) when ch is not a
 * valid lead byte; a continuation byte counts as invalid here. */
int utf8_len(const char ch)
{
	/* Only entries 1..4 describe lead bytes.  Clearing the data bits of ch
	 * must leave exactly the fixed prefix stored in ->lead. */
	for (int len = 1; len <= 4; ++len) {
		if ((ch & ~utf[len]->mask) == utf[len]->lead) {
			return len;
		}
	}

	/* Malformed leading byte */
	exit(1);
}

/* Encode cp as UTF-8.  The result is NUL-terminated and lives in a static
 * buffer, so each call overwrites the previous result. */
char *to_utf8(const uint32_t cp)
{
	static char ret[5];
	const int bytes = codepoint_len(cp);

	/* Walk from the last byte towards the first, peeling six bits off the
	 * remaining value for every continuation byte. */
	uint32_t rest = cp;
	int pos = bytes - 1;
	while (pos != 0) {
		ret[pos] = (rest & utf[0]->mask) | utf[0]->lead;
		rest >>= 6;
		--pos;
	}

	/* Whatever is left goes into the lead byte. */
	ret[0] = (rest & utf[bytes]->mask) | utf[bytes]->lead;
	ret[bytes] = '\0';
	return ret;
}

/* Decode the UTF-8 sequence starting at chr and return its code point.
 * The sequence length is taken from the lead byte via utf8_len(). */
uint32_t to_cp(const char chr[4])
{
	const int bytes = utf8_len(chr[0]);

	/* Begin with the data bits of the lead byte ... */
	uint32_t codep = chr[0] & utf[bytes]->mask;

	/* ... then append six data bits for each continuation byte. */
	for (int i = 1; i < bytes; ++i) {
		codep = (codep << 6) | (chr[i] & utf[0]->mask);
	}

	return codep;
}

/* Encode and decode each sample code point and print one row per sample. */
int main(void)
{
	/* Zero-terminated list of the code points to demonstrate. */
	const uint32_t *input = (uint32_t []){0x0041, 0x00f6, 0x0416, 0x20ac, 0x1d11e, 0x0};

	printf("Character Unicode UTF-8 encoding (hex)\n");
	printf("----------------------------------------\n");

	while (*input != 0) {
		char *utf8 = to_utf8(*input);
		uint32_t codepoint = to_cp(utf8);
		printf("%s U+%-7.4x", utf8, codepoint);

		/* Dump the encoded bytes, stopping at the terminator. */
		for (int i = 0; utf8[i] && i < 4; ++i) {
			printf("%hhx ", utf8[i]);
		}
		printf("\n");
		++input;
	}
	return 0;
} </lang> Output <lang> Character Unicode UTF-8 encoding (hex)

A U+0041 41 ö U+00f6 c3 b6 Ж U+0416 d0 96 € U+20ac e2 82 ac 𝄞 U+1d11e f0 9d 84 9e

</lang>

D

<lang D>import std.conv; import std.stdio;

// The five sample characters from the task, each as a one-character string literal.
immutable CHARS = ["A","ö","Ж","€","𝄞"];

// Prints one row per sample character: the character itself, its code
// point in hex, and the code units (bytes) of its string representation.
void main() {
    writeln("Character   Code-Point   Code-Units");
    foreach (sample; CHARS) {
        // Reinterpret the string's storage as raw bytes.
        auto codeUnits = cast(ubyte[]) sample;
        // Convert to a dstring and take its first element as a number.
        auto codePoint = cast(uint) sample.to!dstring()[0];
        writefln("%s              %7X   [%(%X, %)]", sample, codePoint, codeUnits);
    }
}</lang>

Output:

Character   Code-Point   Code-Units
A                   41   [41]
ö                   F6   [C3, B6]
Ж                  416   [D0, 96]
€                 20AC   [E2, 82, AC]
𝄞                1D11E   [F0, 9D, 84, 9E]

Elena

ELENA 3.3 : <lang elena>import system'routines. import extensions.

// Extension methods on literal (string) values used by the demo program
// to print a string in three different representations.
literal extension op {

   // Prints the receiver followed by a single space.
   literal printAsString
   [
      console print(self," ")
   ]

   // Prints every element of `self toByteArray` via toLiteral(16), each
   // followed by a space (radix 16, presumably; confirm against the
   // ELENA 3.3 library docs).
   literal printAsUTF8Array    
   [
       self toByteArray; forEach(:b) [ console print(b toLiteral(16)," ") ].
   ]
   
   // Prints every element of `self toArray` as "U+" followed by
   // `c toInt` rendered via toLiteral(16) and a space.
   printAsUTF32
   [
       self toArray; forEach(:c)[ console print("U+",c toInt; toLiteral(16)," ")  ].
   ]

}

// Entry point: for each of the five sample strings, send the three print
// messages defined in the `op` extension, then end the output line.
program = [

   "A" printAsString; printAsUTF8Array; printAsUTF32.
   console printLine.
   
   "ö" printAsString; printAsUTF8Array; printAsUTF32.
   console printLine.

   "Ж" printAsString; printAsUTF8Array; printAsUTF32.
   console printLine.

   "€" printAsString; printAsUTF8Array; printAsUTF32.
   console printLine.

   "𝄞" printAsString; printAsUTF8Array; printAsUTF32.
   console printLine.

].</lang>

Output:

A 41 U+41 
ö C3 B6 U+F6 
Ж D0 96 U+416 
€ E2 82 AC U+20AC 
𝄞 F0 9D 84 9E U+1D11E

F#

<lang fsharp> // Unicode character point to UTF8. Nigel Galloway: March 19th., 2018
// fN g returns the UTF-8 bytes of code point g as a list of ints.
// List.findIndex raises when g is 0x110000 or larger.
let fN g =
  // One continuation byte: prefix 10 plus the six bits of g starting at bit `shift`.
  let cont shift = 0x80 + ((g >>> shift) &&& 0x3f)
  // The index of the first limit above g selects the sequence length.
  match List.findIndex (fun limit -> limit > g) [0x80; 0x800; 0x10000; 0x110000] with
  | 0 -> [g]
  | 1 -> [0xc0 + ((g >>> 6) &&& 0x1f); cont 0]
  | 2 -> [0xe0 + ((g >>> 12) &&& 0x0f); cont 6; cont 0]
  | _ -> [0xf0 + ((g >>> 18) &&& 0x07); cont 12; cont 6; cont 0]

</lang>

Output:

for n in fN 0x41    do printf "%x " n -> 41
for n in fN 0xf6    do printf "%x " n -> c3 b6 
for n in fN 0x416   do printf "%x " n -> d0 96 
for n in fN 0x20ac  do printf "%x " n -> e2 82 ac 
for n in fN 0x1d11e do printf "%x " n -> f0 9d 84 9e

Go

Implementation

This implementation is missing all checks for invalid data and so is not production-ready, but illustrates the basic UTF-8 encoding scheme. <lang go>package main

import (

   "bytes"
   "encoding/hex"
   "fmt"
   "log"
   "strings"

)

// testCases pairs each demonstration rune with its expected UTF-8 encoding,
// written as space-separated hex bytes. Both struct fields are embedded
// (unnamed), so they are accessed as tc.rune and tc.string.
var testCases = []struct {

   rune
   string

}{

   {'A', "41"},
   {'ö', "C3 B6"},
   {'Ж', "D0 96"},
   {'€', "E2 82 AC"},
   {'𝄞', "F0 9D 84 9E"},

}

func main() {

   for _, tc := range testCases {
       // derive some things from test data
       u := fmt.Sprintf("U+%04X", tc.rune)
       b, err := hex.DecodeString(strings.Replace(tc.string, " ", "", -1))
       if err != nil {
           log.Fatal("bad test data")
       }
       // exercise encoder and decoder on test data
       e := encodeUTF8(tc.rune)
       d := decodeUTF8(b)
       // show function return values
       fmt.Printf("%c  %-7s  %X\n", d, u, e)
       // validate return values against test data
       if !bytes.Equal(e, b) {
           log.Fatal("encodeUTF8 wrong")
       }
       if d != tc.rune {
           log.Fatal("decodeUTF8 wrong")
       }
   }

}

// Bit patterns used by encodeUTF8 and decodeUTF8. Each xLead constant holds
// the fixed prefix bits of a byte kind and the matching xMask constant
// selects the data bits carried by that byte.
const (

   // first byte of a 2-byte encoding starts 110 and carries 5 bits of data
   b2Lead = 0xC0 // 1100 0000
   b2Mask = 0x1F // 0001 1111

   // first byte of a 3-byte encoding starts 1110 and carries 4 bits of data
   b3Lead = 0xE0 // 1110 0000
   b3Mask = 0x0F // 0000 1111

   // first byte of a 4-byte encoding starts 11110 and carries 3 bits of data
   b4Lead = 0xF0 // 1111 0000
   b4Mask = 0x07 // 0000 0111

   // non-first bytes start 10 and carry 6 bits of data
   mbLead = 0x80 // 1000 0000
   mbMask = 0x3F // 0011 1111

)

func encodeUTF8(r rune) []byte {

   switch i := uint32(r); {
   case i <= 1<<7-1: // max code point that encodes into a single byte
       return []byte{byte(r)}
   case i <= 1<<11-1: // into two bytes
       return []byte{
           b2Lead | byte(r>>6),
           mbLead | byte(r)&mbMask}
   case i <= 1<<16-1: // three
       return []byte{
           b3Lead | byte(r>>12),
           mbLead | byte(r>>6)&mbMask,
           mbLead | byte(r)&mbMask}
   default:
       return []byte{
           b4Lead | byte(r>>18),
           mbLead | byte(r>>12)&mbMask,
           mbLead | byte(r>>6)&mbMask,
           mbLead | byte(r)&mbMask}
   }

}

func decodeUTF8(b []byte) rune {

   switch b0 := b[0]; {
   case b0 < 0x80:
       return rune(b0)
   case b0 < 0xE0:
       return rune(b0&b2Mask)<<6 |
           rune(b[1]&mbMask)
   case b0 < 0xF0:
       return rune(b0&b3Mask)<<12 |
           rune(b[1]&mbMask)<<6 |
           rune(b[2]&mbMask)
   default:
       return rune(b0&b4Mask)<<18 |
           rune(b[1]&mbMask)<<12 |
           rune(b[2]&mbMask)<<6 |
           rune(b[3]&mbMask)
   }

}</lang>

Output:

A  U+0041   41
ö  U+00F6   C3B6
Ж  U+0416   D096
€  U+20AC   E282AC
𝄞  U+1D11E  F09D849E

Library/language

<lang go>package main

import (

   "fmt"
   "unicode/utf8"

)

// utf8encode returns the UTF-8 bytes of codepoint as produced by
// utf8.EncodeRune.
func utf8encode(codepoint rune) []byte {
	// utf8.UTFMax (4) is the longest encoding EncodeRune can write.
	var buffer [utf8.UTFMax]byte
	n := utf8.EncodeRune(buffer[:], codepoint)
	return buffer[:n]
}

// utf8decode returns the first rune decoded from bytes by utf8.DecodeRune;
// the reported width is discarded.
func utf8decode(bytes []byte) (result rune) {
	result, _ = utf8.DecodeRune(bytes)
	return
}

func main() {

       fmt.Printf("%-7s %7s\t%s\t%s\n", "Char", "Unicode", "UTF-8 encoded", "Decoded");
   for _, codepoint := range []rune{'A', 'ö', 'Ж', '€', '𝄞'} {
       encoded := utf8encode(codepoint)
       decoded := utf8decode(encoded)
       fmt.Printf("%-7c U+%04X\t%-12X\t%c\n", codepoint, codepoint, encoded, decoded)
   }

}</lang>

Output:

Char    Unicode	UTF-8 encoded	Decoded
A       U+0041	41          	A
ö       U+00F6	C3B6        	ö
Ж       U+0416	D096        	Ж
€       U+20AC	E282AC      	€
𝄞       U+1D11E	F09D849E    	𝄞

Alternately: <lang go>package main

import (

   "fmt"

)

// utf8encode relies on the language's rune-to-string conversion, which
// yields the UTF-8 representation of the rune.
func utf8encode(codepoint rune) []byte {
	text := string(codepoint)
	return []byte(text)
}

// utf8decode converts bytes to a string, splits that string into runes,
// and returns the first one. An input that yields no runes causes an
// index-out-of-range panic.
func utf8decode(bytes []byte) rune {
	runes := []rune(string(bytes))
	return runes[0]
}

func main() {

       fmt.Printf("%-7s %7s\t%s\t%s\n", "Char", "Unicode", "UTF-8 encoded", "Decoded");
   for _, codepoint := range []rune{'A', 'ö', 'Ж', '€', '𝄞'} {
       encoded := utf8encode(codepoint)
       decoded := utf8decode(encoded)
       fmt.Printf("%-7c U+%04X\t%-12X\t%c\n", codepoint, codepoint, encoded, decoded)
   }

}</lang>

Output:

Char    Unicode	UTF-8 encoded	Decoded
A       U+0041	41          	A
ö       U+00F6	C3B6        	ö
Ж       U+0416	D096        	Ж
€       U+20AC	E282AC      	€
𝄞       U+1D11E	F09D849E    	𝄞

Haskell

Example makes use of bytestring and text packages:

<lang haskell>module Main (main) where

import qualified Data.ByteString as ByteString (pack, unpack) import Data.Char (chr, ord) import Data.Foldable (for_) import Data.List (intercalate) import qualified Data.Text as Text (head, singleton) import qualified Data.Text.Encoding as Text (decodeUtf8, encodeUtf8) import Text.Printf (printf)

-- | Encode a code point as the list of its UTF-8 byte values.
encodeCodepoint :: Int -> [Int]
encodeCodepoint codepoint = fromIntegral <$> ByteString.unpack encoded
  where
    -- One-character Text for the code point, serialised as UTF-8.
    encoded = Text.encodeUtf8 (Text.singleton (chr codepoint))

-- | Decode a list of UTF-8 byte values and return the code point of the
-- first decoded character.
decodeToCodepoint :: [Int] -> Int
decodeToCodepoint octets = ord (Text.head decoded)
  where
    -- Pack the byte values into a ByteString and decode it as UTF-8.
    decoded = Text.decodeUtf8 (ByteString.pack (fromIntegral <$> octets))

-- | Print a table with one row per sample code point: the character, its
-- U+ notation, its UTF-8 bytes in hex, and the character obtained by
-- decoding those bytes again.
main :: IO ()
main = do

   putStrLn "Character  Unicode  UTF-8 encoding (hex)  Decoded"
   putStrLn "-------------------------------------------------"
   for_ [0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E] $ \codepoint -> do
       -- Round trip: encode to bytes, then decode those bytes.
       let values = encodeCodepoint codepoint
           codepoint' = decodeToCodepoint values
       -- The inner printf calls build the padded column contents as Strings.
       putStrLn $ printf "%c          %-7s  %-20s  %c"
           codepoint
           (printf "U+%04X" codepoint :: String)
           (intercalate " " (map (printf "%02X") values))
           codepoint'</lang>

Output:

Character  Unicode  UTF-8 encoding (hex)  Decoded
-------------------------------------------------
A          U+0041   41                    A
ö          U+00F6   C3 B6                 ö
Ж          U+0416   D0 96                 Ж
€          U+20AC   E2 82 AC              €
𝄞          U+1D11E  F0 9D 84 9E           𝄞

J

Solution: <lang j>NB. Three verbs built on the primitive u: (each definition on its own line,
NB. because an NB. comment runs to the end of the line).
utf8=: 8&u:              NB. converts to UTF-8 from unicode or unicode codepoint integer
ucp=: 9&u:               NB. converts to unicode from UTF-8 or unicode codepoint integer
ucp_hex=: hfd@(3 u: ucp) NB. converts to unicode codepoint hexadecimal from UTF-8, unicode or unicode codepoint integer</lang>

Examples: <lang j> utf8 65 246 1046 8364 119070 AöЖ€𝄞

  ucp 65 246 1046 8364 119070

AöЖ€𝄞

  ucp 'AöЖ€𝄞'

AöЖ€𝄞

  utf8 ucp 65 246 1046 8364 119070

AöЖ€𝄞

  ucp_hex utf8 65 246 1046 8364 119070

00041 000f6 00416 020ac 1d11e

  utf8@dfh ucp_hex utf8 65 246 1046 8364 119070

AöЖ€𝄞</lang>

Java

Works with: Java version 7+

<lang java>import java.nio.charset.StandardCharsets; import java.util.Formatter;

public class UTF8EncodeDecode {

   public static byte[] utf8encode(int codepoint) {
       return new String(new int[]{codepoint}, 0, 1).getBytes(StandardCharsets.UTF_8);
   }

   public static int utf8decode(byte[] bytes) {
       return new String(bytes, StandardCharsets.UTF_8).codePointAt(0);
   }

   public static void main(String[] args) {
       System.out.printf("%-7s %-43s %7s\t%s\t%7s%n",
               "Char", "Name", "Unicode", "UTF-8 encoded", "Decoded");

       for (int codepoint : new int[]{0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E}) {
           byte[] encoded = utf8encode(codepoint);
           Formatter formatter = new Formatter();
           for (byte b : encoded) {
               formatter.format("%02X ", b);
           }
           String encodedHex = formatter.toString();
           int decoded = utf8decode(encoded);
           System.out.printf("%-7c %-43s U+%04X\t%-12s\tU+%04X%n",
                   codepoint, Character.getName(codepoint), codepoint, encodedHex, decoded);
       }
   }

}</lang>

Output:

Char    Name                                        Unicode	UTF-8 encoded	Decoded
A       LATIN CAPITAL LETTER A                      U+0041	41          	A
ö       LATIN SMALL LETTER O WITH DIAERESIS         U+00F6	C3 B6       	ö
Ж       CYRILLIC CAPITAL LETTER ZHE                 U+0416	D0 96       	Ж
€       EURO SIGN                                   U+20AC	E2 82 AC    	€
𝄞      MUSICAL SYMBOL G CLEF                       U+1D11E	F0 9D 84 9E 	𝄞

JavaScript

An implementation in ECMAScript 2015 (ES6): <lang javascript> /***************************************************************************\ |* Pure UTF-8 handling without detailed error reporting functionality. *| |***************************************************************************| |* utf8encode *| |* < String character or UInt32 code point *| |* > Uint8Array encoded_character *| |* | ErrorString *| |* *| |* utf8encode takes a string or uint32 representing a single code point *| |* as its argument and returns an array of length 1 up to 4 containing *| |* utf8 code units representing that character. *| |***************************************************************************| |* utf8decode *| |* < Unit8Array [highendbyte highmidendbyte lowmidendbyte lowendbyte] *| |* > uint32 character *| |* | ErrorString *| |* *| |* utf8decode takes an array of one to four uint8 representing utf8 code *| |* units and returns a uint32 representing that code point. *| \***************************************************************************/

const

 // utf8encode: takes a one-character string or a numeric code point and
 // returns a Uint8Array of 1 to 4 UTF-8 code units. A numeric argument is
 // masked to 21 bits first. Throws the string 'Invalid Unicode Code Point!'
 // when the resulting value is 0x110000 or larger.
 utf8encode=
   n=>
     (m=>
       m<0x80
      ?Uint8Array.from(
         [ m>>0&0x7f|0x00])
      :m<0x800
      ?Uint8Array.from(
         [ m>>6&0x1f|0xc0,m>>0&0x3f|0x80])
      :m<0x10000
      ?Uint8Array.from(
         [ m>>12&0x0f|0xe0,m>>6&0x3f|0x80,m>>0&0x3f|0x80])
      :m<0x110000
      ?Uint8Array.from(
         [ m>>18&0x07|0xf0,m>>12&0x3f|0x80,m>>6&0x3f|0x80,m>>0&0x3f|0x80])
      :(()=>{throw'Invalid Unicode Code Point!'})())
     ( typeof n==='string'
      ?n.codePointAt(0)
      :n&0x1fffff),

 // utf8decode: takes an array-like of 1 to 4 UTF-8 code units and returns
 // the code point as a number. Throws the string 'Invalid UTF-8 encoding!'
 // for overlong forms, surrogates, values above U+10FFFF, and missing or
 // malformed continuation bytes. Elements after the decoded sequence are
 // ignored.
 utf8decode=
   ([m,n,o,p])=>{
     // A continuation byte has the form 10xxxxxx. The parentheses matter:
     // `===` binds tighter than `&`, so `b===b&0xbf` would always be 1.
     const cont=b=>(b&0xc0)===0x80;

     // 1 byte: 0xxxxxxx
     if(m<0x80)
       return ( m&0x7f)<<0;

     // 2 bytes: lead 0xc2..0xdf (0xc0 and 0xc1 would be overlong)
     if(0xc1<m&&m<0xe0&&cont(n))
       return ( m&0x1f)<<6|( n&0x3f)<<0;

     // 3 bytes: the allowed range of the second byte depends on the lead,
     // which excludes overlong forms (0xe0) and surrogates (0xed).
     if(( m===0xe0&&0x9f<n&&n<0xc0
        ||0xe0<m&&m<0xed&&0x7f<n&&n<0xc0
        ||m===0xed&&0x7f<n&&n<0xa0
        ||0xed<m&&m<0xf0&&0x7f<n&&n<0xc0)
      &&cont(o))
       return ( m&0x0f)<<12|( n&0x3f)<<6|( o&0x3f)<<0;

     // 4 bytes: the second byte range excludes overlong forms (0xf0) and
     // values above U+10FFFF (0xf4).
     if(( m===0xf0&&0x8f<n&&n<0xc0
        ||m===0xf4&&0x7f<n&&n<0x90
        ||0xf0<m&&m<0xf4&&0x7f<n&&n<0xc0)
      &&cont(o)&&cont(p))
       return ( m&0x07)<<18|( n&0x3f)<<12|( o&0x3f)<<6|( p&0x3f)<<0;

     throw'Invalid UTF-8 encoding!';
   }

</lang> The testing inputs: <lang javascript> const

 // the five sample characters as one string
 str=
   'AöЖ€𝄞'
 // code point of each character in str
,cps=
   Uint32Array.from(str,s=>s.codePointAt(0))
 // expected UTF-8 code units of each character, one Uint8Array per character
,cus=
   [ [ 0x41]
    ,[ 0xc3,0xb6]
    ,[ 0xd0,0x96]
    ,[ 0xe2,0x82,0xac]
    ,[ 0xf0,0x9d,0x84,0x9e]]
  .map(a=>Uint8Array.from(a))
 // zip3 walks three iterables in parallel and returns [a,b,c] triples;
 // it recurses while any of the three still has remaining elements
,zip3=
   ([a,...as],[b,...bs],[c,...cs])=>
     0<as.length+bs.length+cs.length
    ?[ [ a,b,c],...zip3(as,bs,cs)]
    :[ [ a,b,c]]
 // one [character, code point, code units] triple per sample
,inputs=zip3(str,cps,cus);

</lang> The testing code: <lang javascript> console.log(`\ ${'Character'.padEnd(16)}\ ${'CodePoint'.padEnd(16)}\ ${'CodeUnits'.padEnd(16)}\ ${'uft8encode(ch)'.padEnd(16)}\ ${'uft8encode(cp)'.padEnd(16)}\ utf8decode(cu)`) for(let [ch,cp,cu] of inputs)

 console.log(`\

${ch.padEnd(16)}\ ${cp.toString(0x10).padStart(8,'U+000000').padEnd(16)}\ ${`[${[...cu].map(n=>n.toString(0x10))}]`.padEnd(16)}\ ${`[${[...utf8encode(ch)].map(n=>n.toString(0x10))}]`.padEnd(16)}\ ${`[${[...utf8encode(cp)].map(n=>n.toString(0x10))}]`.padEnd(16)}\ ${utf8decode(cu).toString(0x10).padStart(8,'U+000000')}`) </lang> and finally, the output from the test:

Character       CodePoint       CodeUnits       uft8encode(ch)  uft8encode(cp)  utf8decode(cu)
A               U+000041        [41]            [41]            [41]            U+000041
ö               U+0000f6        [c3,b6]         [c3,b6]         [c3,b6]         U+0000f6
Ж               U+000416        [d0,96]         [d0,96]         [d0,96]         U+000416
€               U+0020ac        [e2,82,ac]      [e2,82,ac]      [e2,82,ac]      U+0020ac
𝄞              U+01d11e        [f0,9d,84,9e]   [f0,9d,84,9e]   [f0,9d,84,9e]   U+01d11e

Note that the misalignment on the last line is caused by the string length of astral characters being 2, which breaks the padding functions.

Julia

Works with: Julia version 0.6

Julia supports UTF-8 encoding by default.

<lang julia>for t in ("A", "ö", "Ж", "€", "𝄞")
    # Encode: the UTF-8 code units of the string as a byte vector.
    enc = Vector{UInt8}(t)
    # Decode from a copy. On newer Julia versions String(v) may take over
    # the vector's memory and leave `v` empty, which would make the byte
    # listing below print as an empty array.
    dec = String(copy(enc))
    println(dec, " → ", enc)
end</lang>

Output:

A → UInt8[0x41]
ö → UInt8[0xc3, 0xb6]
Ж → UInt8[0xd0, 0x96]
€ → UInt8[0xe2, 0x82, 0xac]
𝄞 → UInt8[0xf0, 0x9d, 0x84, 0x9e]

Kotlin

<lang scala>// version 1.1.2

/** Encodes a single Unicode code point as its UTF-8 byte sequence. */
fun utf8Encode(codePoint: Int): ByteArray {
    // Build a one-code-point string, then serialise it as UTF-8.
    val text = String(intArrayOf(codePoint), 0, 1)
    return text.toByteArray(Charsets.UTF_8)
}

/** Decodes UTF-8 bytes and returns the first code point of the resulting string. */
fun utf8Decode(bytes: ByteArray): Int {
    val text = bytes.toString(Charsets.UTF_8)
    return text.codePointAt(0)
}

/** Prints one table row per sample code point: character, name, U+ value, UTF-8 bytes, decoded character. */
fun main(args: Array<String>) {
    println("Char  Name                                 Unicode  UTF-8         Decoded")
    for (codePoint in intArrayOf(0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E)) {
        // The column width is switched at the 0xFFFF boundary to keep the table aligned.
        val charWidth = if (codePoint <= 0xFFFF) 4 else 5
        System.out.printf("%-${charWidth}c  %-35s  U+%05X  ", codePoint, Character.getName(codePoint), codePoint)

        val bytes = utf8Encode(codePoint)
        // Two hex digits and a trailing space per encoded byte.
        val hex = bytes.joinToString("") { "%02X ".format(it) }

        val decoded = utf8Decode(bytes)
        val hexWidth = if (decoded <= 0xFFFF) 12 else 11
        System.out.printf("%-${hexWidth}s  %c\n", hex, decoded)
    }
}</lang>

Output:

Char  Name                                 Unicode  UTF-8         Decoded
A     LATIN CAPITAL LETTER A               U+00041  41            A
ö     LATIN SMALL LETTER O WITH DIAERESIS  U+000F6  C3 B6         ö
Ж     CYRILLIC CAPITAL LETTER ZHE          U+00416  D0 96         Ж
€     EURO SIGN                            U+020AC  E2 82 AC      €
𝄞     MUSICAL SYMBOL G CLEF                U+1D11E  F0 9D 84 9E   𝄞

Lingo

Since UTF-8 is Lingo's native string encoding, and UTF-8 strings can be read into byteArrays (and v.v.), such UTF-8 encoding and decoding is built-in.
Relevant Lingo functions are:
- charToNum (string): converts single-character string to unicode code point (int)
- numToChar (int): converts unicode code point (int) to single-character string
- byteArray (string): creates byte array of UTF-8 bytes for string
- byteArray.toHexString (intStart, intLen): returns hex string representation of byte array (e.g. for printing)
- byteArray.readRawString (intLen, [strCharSet="UTF-8"]): reads a fixed number of bytes as a string <lang Lingo>chars = ["A", "ö", "Ж", "€", "𝄞"] put "Character Unicode (int) UTF-8 (hex) Decoded" repeat with c in chars

   ba = bytearray(c)
   put col(c, 12) & col(charToNum(c), 16) & col(ba.toHexString(1, ba.length), 14) & ba.readRawString(ba.length)

end repeat</lang> Helper function for table formatting <lang Lingo>on col (val, len)

   str = string(val)
   repeat with i = str.length+1 to len
       put " " after str
   end repeat
   return str

end</lang>

Output:

Character   Unicode (int)   UTF-8 (hex)   Decoded
A           65              41            A
ö           246             c3 b6         ö
Ж           1046            d0 96         Ж
€           8364            e2 82 ac      €
𝄞           119070          f0 9d 84 9e   𝄞

Mathematica

<lang Mathematica>(* Encode: render the string with CharacterEncoding -> "UTF8" and take the character codes of the result. *)
utf = ToCharacterCode[ToString["AöЖ€", CharacterEncoding -> "UTF8"]]
(* Decode: rebuild the string from those codes as "UTF8", then list its character codes. *)
ToCharacterCode[FromCharacterCode[utf, "UTF8"]]</lang>

Output:

{65, 195, 182, 208, 150, 226, 130, 172}
{65, 246, 1046, 8364}

Perl

<lang perl>#!/usr/bin/perl
use strict;
use warnings;
use Unicode::UCD 'charinfo';         # getting the unicode name of the character
use utf8;                            # using non-ascii-characters in source code
binmode STDOUT, ":encoding(UTF-8)";  # printing non-ascii-characters to screen

my @chars = map {ord} qw/A ö Ж € 𝄞/; # @chars contains the unicode points
my $print_format = '%5s %-35s';

# table header
printf "$print_format %8s %s\n" , 'char', 'name', 'unicode', 'utf-8 encoding';

# one row per code point
for my $code_point (@chars) {
    my $name = charinfo($code_point)->{'name'};  # get unicode name
    printf "$print_format %06x " , chr($code_point), lc $name, $code_point;

    my $utf8 = chr($code_point);                 # single char
    utf8::encode($utf8);                         # inplace encoding into utf8 parts

    # print the ordinal of every encoded byte in hex
    for my $part (split //, $utf8) {
        printf " %x", ord($part);
    }
    print "\n";
}</lang>

Output:

 char  name                                 unicode  utf-8 encoding
    A  latin capital letter a               000041   41
    ö  latin small letter o with diaeresis  0000f6   c3 b6
    Ж  cyrillic capital letter zhe          000416   d0 96
    €  euro sign                            0020ac   e2 82 ac
    𝄞  musical symbol g clef                01d11e   f0 9d 84 9e

Perl 6

Works with: Rakudo version 2017.02

Pretty much all built in to the language. <lang perl6>say sprintf("%-18s %-36s|%8s| %7s |%14s | %s\n", 'Character|', 'Name', 'Ordinal', 'Unicode', 'UTF-8 encoded', 'decoded'), '-' x 100;

# One table row per test string. The columns are: the string itself, its
# character names (.uninames), its ordinals (.ords), the ordinals in base 16
# with a 'U+' prefix, the UTF-8 encoded bytes in base 16, and the result of
# decoding those bytes again.
for < A ö Ж € 𝄞 😜 👨‍👩‍👧‍👦> -> $char {

   printf "   %-5s | %-43s | %6s | %-7s | %12s  |%4s\n", $char, $char.uninames.join(','), $char.ords.join(' '),
     ('U+' X~ $char.ords».base(16)).join(' '), $char.encode('UTF8').list».base(16).Str, $char.encode('UTF8').decode;

}</lang>

Output:

Character|         Name                                | Ordinal| Unicode | UTF-8 encoded | decoded
----------------------------------------------------------------------------------------------------
   A     | LATIN CAPITAL LETTER A                      |     65 | U+41    |           41  |   A
   ö     | LATIN SMALL LETTER O WITH DIAERESIS         |    246 | U+F6    |        C3 B6  |   ö
   Ж    | CYRILLIC CAPITAL LETTER ZHE                 |   1046 | U+416   |        D0 96  |   Ж
   €     | EURO SIGN                                   |   8364 | U+20AC  |     E2 82 AC  |   €
   𝄞     | MUSICAL SYMBOL G CLEF                       | 119070 | U+1D11E |  F0 9D 84 9E  |   𝄞
   😜    | FACE WITH STUCK-OUT TONGUE AND WINKING EYE  | 128540 | U+1F61C |  F0 9F 98 9C  |   😜
   👨‍👩‍👧‍👦    | MAN,ZERO WIDTH JOINER,WOMAN,ZERO WIDTH JOINER,GIRL,ZERO WIDTH JOINER,BOY | 128104 8205 128105 8205 128103 8205 128102 | U+1F468 U+200D U+1F469 U+200D U+1F467 U+200D U+1F466 | F0 9F 91 A8 E2 80 8D F0 9F 91 A9 E2 80 8D F0 9F 91 A7 E2 80 8D F0 9F 91 A6  |   👨‍👩‍👧‍👦

Phix

Standard autoinclude, see the manual and/or builtins/utfconv.e ( http://phix.x10.mx/docs/html/utfconv.htm and/or https://bitbucket.org/petelomax/phix/src )
As requested in the task description: <lang Phix>constant tests = {#0041, #00F6, #0416, #20AC, #1D11E}

-- Formats every element of s with sprintf(fmt, ...) and returns the
-- results joined with commas.
function hex(sequence s, string fmt) -- output helper

   -- replace each element with its formatted text
   for i=1 to length(s) do
       s[i] = sprintf(fmt,s[i])
   end for
   return join(s,',')

end function

-- For every test code point: convert it with utf32_to_utf8, convert the
-- result back with utf8_to_utf32, and print all three stages.
for i=1 to length(tests) do

   integer codepoint = tests[i]
   sequence s = utf32_to_utf8({codepoint}),
            r = utf8_to_utf32(s)
   printf(1,"#%04x -> {%s} -> {%s}\n",{codepoint, hex(s,"#%02x"),hex(r,"#%04x")})

end for</lang>

Output:

#0041 -> {#41} -> {#0041}
#00F6 -> {#C3,#B6} -> {#00F6}
#0416 -> {#D0,#96} -> {#0416}
#20AC -> {#E2,#82,#AC} -> {#20AC}
#1D11E -> {#F0,#9D,#84,#9E} -> {#1D11E}

Python

<lang python>#!/usr/bin/env python3

from unicodedata import name

def unicode_code(ch):
    """Return the code point of character ``ch`` as ``U+`` followed by at
    least four lower-case hexadecimal digits."""
    code_point = ord(ch)
    return 'U+' + format(code_point, '04x')

def utf8hex(ch):
    """Return the UTF-8 encoding of ``ch`` as space-separated hex bytes.

    Every byte is rendered as exactly two uppercase hex digits, so bytes
    below 0x10 keep their leading zero (``'\\n'`` -> ``'0A'``).

    :param ch: the string (normally a single character) to encode.
    :returns: e.g. ``'E2 82 AC'`` for the euro sign.
    """
    # '{:02X}' pads to two digits; hex(c)[2:] would drop the leading zero.
    return " ".join('{:02X}'.format(byte) for byte in ch.encode('utf8'))

if __name__ == "__main__":

   print('{:<11} {:<36} {:<15} {:<15}'.format('Character', 'Name', 'Unicode', 'UTF-8 encoding (hex)'))
   chars = ['A', 'ö', 'Ж', '€', '𝄞']
   for char in chars:
       print('{:<11} {:<36} {:<15} {:<15}'.format(char, name(char), unicode_code(char), utf8hex(char)))</lang>

Output:

Character   Name                                 Unicode         UTF-8 encoding (hex)
A           LATIN CAPITAL LETTER A               U+0041          41             
ö           LATIN SMALL LETTER O WITH DIAERESIS  U+00f6          C3 B6          
Ж           CYRILLIC CAPITAL LETTER ZHE          U+0416          D0 96          
€           EURO SIGN                            U+20ac          E2 82 AC       
𝄞           MUSICAL SYMBOL G CLEF                U+1d11e         F0 9D 84 9E

Racket

<lang racket>#lang racket

;; Association list mapping a descriptive symbol to the character
;; it names; these are the five test characters of the task.
(define char-map
  (list (cons 'LATIN-CAPITAL-LETTER-A              (integer->char #x0041))
        (cons 'LATIN-SMALL-LETTER-O-WITH-DIAERESIS (integer->char #x00F6))
        (cons 'CYRILLIC-CAPITAL-LETTER-ZHE         (integer->char #x0416))
        (cons 'EURO-SIGN                           (integer->char #x20AC))
        (cons 'MUSICAL-SYMBOL-G-CLEF               (integer->char #x1D11E))))

;; For each (name . char) pair: encode the character to UTF-8, show the
;; bytes as hex strings, decode them again and print one tab-separated row.
(for ([entry (in-list char-map)])
  (match-define (cons label ch) entry)
  (define octets (bytes->list (string->bytes/utf-8 (string ch))))
  (define hex-octets
    (for/list ([octet (in-list octets)])
      (number->string octet 16)))
  (define round-trip (bytes->string/utf-8 (apply bytes octets)))
  (printf "~s\t~a\t~a\t~a\t~a~%" ch ch hex-octets round-trip label))</lang>

Output:

#\A	A	(41)	A	LATIN-CAPITAL-LETTER-A
#\ö	ö	(c3 b6)	ö	LATIN-SMALL-LETTER-O-WITH-DIAERESIS
#\Ж	Ж	(d0 96)	Ж	CYRILLIC-CAPITAL-LETTER-ZHE
#\€	€	(e2 82 ac)	€	EURO-SIGN
#\𝄞	𝄞	(f0 9d 84 9e)	𝄞	MUSICAL-SYMBOL-G-CLEF

Scala

Imperative solution

<lang scala>object UTF8EncodeAndDecode extends App {

  // Imported locally so this snippet compiles on its own.
  import java.nio.charset.StandardCharsets

  /** The five code points the task asks to demonstrate. */
  val codePoints = Seq(0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E)

  /** Encodes a single Unicode code point as its UTF-8 byte sequence. */
  def utf8Encode(codepoint: Int): Array[Byte] =
    new String(Array[Int](codepoint), 0, 1).getBytes(StandardCharsets.UTF_8)

  /** Decodes UTF-8 bytes and returns the first code point of the resulting string. */
  def utf8Decode(bytes: Array[Byte]): Int =
    new String(bytes, StandardCharsets.UTF_8).codePointAt(0)

  println("Char Name                                 Unicode  UTF-8       Decoded")
  for (codePoint <- codePoints) {
    val w = if (Character.isBmpCodePoint(codePoint)) 4 else 5 // Compute spacing
    val bytes = utf8Encode(codePoint)

    def leftAlignedHex = f"U+${codePoint}%04X"

    // Accumulate the hex dump imperatively, one "XX " group per byte.
    val s = new StringBuilder()
    bytes.foreach(byte => s ++= "%02X ".format(byte))

    printf(s"%-${w}c %-36s %-7s  %-${16 - w}s%c%n",
      codePoint, Character.getName(codePoint), leftAlignedHex, s, utf8Decode(bytes))
  }
}</lang>

Functional solution

<lang scala>import java.nio.charset.StandardCharsets

object UTF8EncodeAndDecode extends App {

  /** The five code points the task asks to demonstrate. */
  val codePoints = Seq(0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E)

  /** Turns one Unicode code point into its UTF-8 bytes. */
  def utf8Encode(codepoint: Int): Array[Byte] = {
    val asText = new String(Array[Int](codepoint), 0, 1)
    asText.getBytes(StandardCharsets.UTF_8)
  }

  /** Reads UTF-8 bytes back and yields the first code point of the text. */
  def utf8Decode(bytes: Array[Byte]): Int = {
    val asText = new String(bytes, StandardCharsets.UTF_8)
    asText.codePointAt(0)
  }

  println("Char Name                                 Unicode  UTF-8       Decoded")
  codePoints.foreach { cp =>
    // Supplementary characters need one more column of padding.
    val w = if (Character.isBmpCodePoint(cp)) 4 else 5
    val encoded = utf8Encode(cp)
    val hexLabel: String = f"U+${cp}%04X"
    val hexDump: String = encoded.map(b => "%02X ".format(b)).mkString

    printf(s"%-${w}c %-36s %-7s  %-${16 - w}s%c%n",
      cp, Character.getName(cp), hexLabel, hexDump, utf8Decode(encoded))
  }

  println(s"\nSuccessfully completed without errors. [total ${scala.compat.Platform.currentTime - executionStart} ms]")

}</lang>

Composable and testable solution

<lang scala>package example

/** Entry point: prints one formatted row per test code point using TheMeat. */
object UTF8EncodeAndDecode extends TheMeat with App {

  val codePoints = Seq(0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E)

  println("Char Name                                 Unicode  UTF-8       Decoded")
  for (cp <- codePoints) {
    // composeString already ends each row with a line separator.
    print(composeString(cp))
  }

  println(s"\nSuccessfully completed without errors. [total ${scala.compat.Platform.currentTime - executionStart} ms]")

}

/** UTF-8 encode/decode helpers plus the row formatter, kept free of I/O so they can be tested. */
trait TheMeat {

  import java.nio.charset.StandardCharsets

  /** Builds one output row (character, name, U+ notation, hex bytes, decoded character). */
  def composeString(codePoint: Int): String = {
    // Supplementary characters need one more column of padding.
    val w = if (Character.isBmpCodePoint(codePoint)) 4 else 5
    val encoded = utf8Encode(codePoint)
    val hexLabel: String = f"U+${codePoint}%04X"
    val hexDump: String = encoded.map(b => "%02X ".format(b)).mkString
    val template = s"%-${w}c %-36s %-7s  %-${16 - w}s%c%n"

    template.format(codePoint, Character.getName(codePoint), hexLabel, hexDump, utf8Decode(encoded))
  }

  /** Turns one Unicode code point into its UTF-8 bytes. */
  def utf8Encode(codepoint: Int): Array[Byte] = {
    val asText = new String(Array[Int](codepoint), 0, 1)
    asText.getBytes(StandardCharsets.UTF_8)
  }

  /** Reads UTF-8 bytes back and yields the first code point of the text. */
  def utf8Decode(bytes: Array[Byte]): Int = {
    val asText = new String(bytes, StandardCharsets.UTF_8)
    asText.codePointAt(0)
  }

} </lang>

Sidef

<lang ruby>func utf8_encoder(Number code) {
    # Encode the character for this code point as UTF-8 and return
    # its bytes as an array of one-character strings.
    var octets = code.chr.encode('UTF-8').bytes
    return octets.map { |octet| octet.chr }
}

func utf8_decoder(Array bytes) {
    # Turn the one-character byte strings back into numbers and
    # decode them as UTF-8.
    var octets = bytes.map { |ch| ch.ord }
    return octets.decode('UTF-8')
}

# Round-trip each test code point and show the decoded character
# next to its encoded bytes.
[0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E].each { |n|
    var encoded = utf8_encoder(n)
    var decoded = utf8_decoder(encoded)
    assert_eq(n, decoded.ord)
    say "#{decoded} -> #{encoded}"
}</lang>

Output:

A -> ["A"]
ö -> ["\xC3", "\xB6"]
Ж -> ["\xD0", "\x96"]
€ -> ["\xE2", "\x82", "\xAC"]
𝄞 -> ["\xF0", "\x9D", "\x84", "\x9E"]

Tcl

Note: Tcl can handle Unicodes only up to U+FFFD, i.e. the Basic Multilingual Plane (BMP, 16 bits wide). Therefore, the fifth test fails as expected. <lang Tcl>proc encoder int {
    # Convert the code point $int to a character, encode it as UTF-8 and
    # return the bytes as a list of two-digit uppercase hex strings.
    set u [format %c $int]
    set bytes {}
    foreach byte [split [encoding convertto utf-8 $u] ""] {
        lappend bytes [format %02X [scan $byte %c]]
    }
    return $bytes
}

proc decoder bytes {
    # Rebuild the raw byte string from a list of hex byte values and
    # decode it from UTF-8 back into a Tcl string.
    set str {}
    foreach byte $bytes {
        append str [format %c [scan $byte %x]]
    }
    return [encoding convertfrom utf-8 $str]
}

# Each command starts on its own line; "} proc ..." or "} foreach ..." on
# one line would be parsed as extra arguments to the preceding command.
foreach test {0x0041 0x00f6 0x0416 0x20ac 0x1d11e} {
    set res $test
    lappend res [encoder $test] -> [decoder [encoder $test]]
    puts $res
}</lang>

0x0041 41 -> A
0x00f6 {C3 B6} -> ö
0x0416 {D0 96} -> Ж
0x20ac {E2 82 AC} -> €
0x1d11e {EF BF BD} -> �

Alternative Implementation

While perhaps not as readable as the above, this version handles beyond-BMP codepoints by manually composing the UTF-8 byte sequences and emitting raw bytes to the console. The encoding convertto utf-8 command still does the heavy lifting where it can.

<lang Tcl>proc utf8 {codepoint} {
    # Return the UTF-8 byte string for a code point given as hex digits
    # (without any prefix), e.g. "20AC" or "1D11E".
    scan $codepoint %llx cp
    if {$cp >= 0x10000} {
        # Beyond the BMP: build the 4-byte sequence by hand from the
        # 21-bit binary representation (3 + 6 + 6 + 6 payload bits).
        set bits [format %021b $cp]
        set unibits 11110[string range $bits 0 2]
        foreach {first last} {3 8 9 14 15 20} {
            append unibits 10[string range $bits $first $last]
        }
        return [binary format B* $unibits]
    }
    # Within the BMP: let Tcl's backslash substitution build the
    # character and the encoding command produce the bytes.
    set char [subst \\u$codepoint]
    return [encoding convertto utf-8 $char]
}

proc hexchars {s} {
    # Render the bytes of $s as lowercase hex pairs, each pair
    # followed by a single space (so the result has a trailing space).
    binary scan $s H* digits
    return [regsub -all {..} $digits {& }]
}

# for the test, we assume the tty is in utf-8 mode and can handle beyond-BMP chars
# so set output mode to binary so we can write raw bytes!
chan configure stdout -encoding binary

# Print each code point, its raw UTF-8 bytes and their hex dump.
foreach codepoint { 41 F6 416 20AC 1D11E } {
    set utf8 [utf8 $codepoint]
    puts "[format U+%04s $codepoint]\t$utf8\t[hexchars $utf8]"
}</lang>

Output:

U+0041 A 41
U+00F6 ö c3 b6
U+0416 Ж d0 96
U+20AC € e2 82 ac
U+1D11E 𝄞 f0 9d 84 9e

zkl

<lang zkl>println("Char Unicode UTF-8"); foreach utf,unicode_int in (T( T("\U41;",0x41), T("\Uf6;",0xf6),

      T("\U416;",0x416), T("\U20AC;",0x20ac), T("\U1D11E;",0x1d11e))){
   // Each test pair is (UTF-8 string literal, Unicode code point as int).
   // Fold the bytes of the UTF-8 string into one integer, first byte
   // most significant (each step shifts by 0x100 and adds the next byte).
   utf_int:=utf.reduce(fcn(s,c){ 0x100*s + c.toAsc() },0);
   char :=unicode_int.toString(-8);	// Unicode int to UTF-8 string
   // UTF-8 bytes to UTF-8 string:
   char2:=Data(Void,utf_int.toBigEndian(utf_int.len())).text;

   // Columns: encoded char, decoded char, code point as U+hex, UTF-8 bytes as hex.
   println("%s %s %9s  %x".fmt(char,char2,"U+%x".fmt(unicode_int),utf_int));

}</lang> Int.len() --> number of bytes in int. This could be hard coded because UTF-8 has a max of 6 bytes and (0x41).toBigEndian(6) --> 0x41,0,0,0,0,0 which is a zero terminated string ("A");

Output:

Char  Unicode  UTF-8
A A      U+41  41
ö ö      U+f6  c3b6
Ж Ж     U+416  d096
€ €    U+20ac  e282ac
𝄞 𝄞   U+1d11e  f09d849e