UTF-8 encode and decode: Difference between revisions

From Rosetta Code
Content added Content deleted
Line 556: Line 556:


<lang haskell>module Main (main) where
<lang haskell>module Main (main) where

import qualified Data.ByteString as ByteString (unpack)
import qualified Data.ByteString as ByteString (pack, unpack)
import Data.Char (chr)
import Data.Char (chr, ord)
import Data.Foldable (for_)
import Data.Foldable (for_)
import Data.List (intercalate)
import Data.List (intercalate)
import qualified Data.Text as Text (singleton)
import qualified Data.Text as Text (head, singleton)
import qualified Data.Text.Encoding as Text (encodeUtf8)
import qualified Data.Text.Encoding as Text (decodeUtf8, encodeUtf8)
import Text.Printf (printf)
import Text.Printf (printf)

encodeCodepoint :: Int -> [Int]
encodeCodepoint :: Int -> [Int]
encodeCodepoint = map fromIntegral . ByteString.unpack . Text.encodeUtf8 . Text.singleton . chr
encodeCodepoint = map fromIntegral . ByteString.unpack . Text.encodeUtf8 . Text.singleton . chr

decodeToCodepoint :: [Int] -> Int
decodeToCodepoint = ord . Text.head . Text.decodeUtf8 . ByteString.pack . map fromIntegral


main :: IO ()
main :: IO ()
main = do
main = do
putStrLn "Character Unicode UTF-8 encoding (hex)"
putStrLn "Character Unicode UTF-8 encoding (hex) Decoded"
putStrLn "----------------------------------------"
putStrLn "-------------------------------------------------"
for_ [0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E] $ \codepoint -> do
for_ [0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E] $ \codepoint -> do
let values = encodeCodepoint codepoint
let values = encodeCodepoint codepoint
codepoint' = decodeToCodepoint values
putStrLn $ printf "%c %-7s %s"
putStrLn $ printf "%c %-7s %-20s %c"
codepoint
codepoint
(printf "U+%04X" codepoint :: String)
(printf "U+%04X" codepoint :: String)
(intercalate " " (map (printf "%02X") values))</lang>
(intercalate " " (map (printf "%02X") values))
codepoint'</lang>
{{out}}
{{out}}
<pre>
<pre>
Character Unicode UTF-8 encoding (hex)
Character Unicode UTF-8 encoding (hex) Decoded
-----------------------------------------
-------------------------------------------------
A U+0041 41
A U+0041 41 A
ö U+00F6 C3 B6
ö U+00F6 C3 B6 ö
Ж U+0416 D0 96
Ж U+0416 D0 96 Ж
€ U+20AC E2 82 AC
€ U+20AC E2 82 AC
𝄞 U+1D11E F0 9D 84 9E
𝄞 U+1D11E F0 9D 84 9E 𝄞
</pre>
</pre>



Revision as of 21:39, 14 April 2018

Task
UTF-8 encode and decode
You are encouraged to solve this task according to the task description, using any language you may know.

As described in UTF-8 and in Wikipedia, UTF-8 is a popular encoding of (multi-byte) Unicode code-points into eight-bit octets.

The goal of this task is to write an encoder that takes a Unicode code-point (an integer representing a Unicode character) and returns a sequence of 1-4 bytes representing that character in the UTF-8 encoding.

Then you have to write the corresponding decoder that takes a sequence of 1-4 UTF-8 encoded bytes and returns the corresponding Unicode character.

Demonstrate the functionality of your encoder and decoder on the following five characters:

Character   Name                                  Unicode    UTF-8 encoding (hex)
---------------------------------------------------------------------------------
A           LATIN CAPITAL LETTER A                U+0041     41
ö           LATIN SMALL LETTER O WITH DIAERESIS   U+00F6     C3 B6
Ж           CYRILLIC CAPITAL LETTER ZHE           U+0416     D0 96
€           EURO SIGN                             U+20AC     E2 82 AC
𝄞           MUSICAL SYMBOL G CLEF                 U+1D11E    F0 9D 84 9E

Provided below is a reference implementation in Common Lisp.

Common Lisp

Helper functions

<lang lisp> (defun ascii-byte-p (octet)

 "Return t if octet is a single-byte 7-bit ASCII char.
 The most significant bit is 0, so the allowed pattern is 0xxx xxxx."
 (assert (typep octet 'integer))
 (assert (<= (integer-length octet) 8))
 (let ((bitmask  #b10000000)
       (template #b00000000))
   ;; bitwise-and the octet with the bitmask #b10000000 to extract the most significant bit.
   ;; check that this bit equals the template #b00000000, i.e. that it is clear.
   (= (logand bitmask octet) template)))

(defun multi-byte-p (octet)
  "Return t if OCTET belongs to a multi-byte UTF-8 sequence.
  Lead bytes and trail bytes of such a sequence both match the pattern
  1xxx xxxx, i.e. their most significant bit is set."
  (assert (typep octet 'integer))
  (assert (<= (integer-length octet) 8))
  ;; bit 7 is the most significant bit of an octet; pull it out on its own.
  (= 1 (ldb (byte 1 7) octet)))

(defun lead-byte-p (octet)
  "Return t if OCTET can start a UTF-8 sequence, nil otherwise.
  A lead byte matches one of 0xxx xxxx, 110x xxxx, 1110 xxxx or 1111 0xxx."
  (assert (typep octet 'integer))
  (assert (<= (integer-length octet) 8))
  ;; each entry is (bitmask . template): masking the octet must give the template.
  (loop for (bitmask . template) in '((#b10000000 . #b00000000)
                                      (#b11100000 . #b11000000)
                                      (#b11110000 . #b11100000)
                                      (#b11111000 . #b11110000))
        thereis (= (logand bitmask octet) template)))

(defun n-trail-bytes (octet)
  "Take a leading UTF-8 byte, return the number of continuation bytes (0-3) that follow it.
  Return nil when OCTET matches none of the lead-byte patterns."
  (assert (typep octet 'integer))
  (assert (<= (integer-length octet) 8))
  ;; the patterns are tried from the shortest encoding to the longest.
  (cond ((= (logand #b10000000 octet) #b00000000) 0)
        ((= (logand #b11100000 octet) #b11000000) 1)
        ((= (logand #b11110000 octet) #b11100000) 2)
        ((= (logand #b11111000 octet) #b11110000) 3)))

</lang>

Encoder

<lang lisp> (defun unicode-to-utf-8 (int)

 "Take a unicode code point, return a list of one to four UTF-8 encoded bytes (octets).
 INT must be an integer in the range #x0 to #x10FFFF; any other value fails an assertion."
 (assert (typep int 'integer))
 ;; checking only for a length of at most 21 bits would admit #x110000..#x1FFFFF
 ;; and negative integers, which none of the COND branches below covers.
 (assert (<= 0 int #x10FFFF))
 (let ((n-trail-bytes (cond ((<= #x00000 int #x00007F) 0)
                            ((<= #x00080 int #x0007FF) 1)
                            ((<= #x00800 int #x00FFFF) 2)
                            ((<= #x10000 int #x10FFFF) 3)))
       (lead-templates (list #b00000000 #b11000000 #b11100000 #b11110000))
       (trail-template #b10000000)
       ;; number of content bits in the lead byte.
       (n-lead-bits (list 7 5 4 3))
       ;; number of content bits in the trail byte.
       (n-trail-bits 6)
       ;; list to put the UTF-8 encoded bytes in.
       (byte-list nil))
   (if (= n-trail-bytes 0)
       ;; if we need 0 trail bytes, it is just an ASCII single byte.
       (push int byte-list)
       (progn
         ;; if we need more than one byte, first fill the trail bytes with 6 bits each.
         ;; the lowest 6 bits are pushed first, so they end up last in the list.
         (loop for i from 0 to (1- n-trail-bytes)
            do (push (+ trail-template
                        (ldb (byte n-trail-bits (* i n-trail-bits)) int))
                     byte-list))
         ;; then copy the remaining content bits to the lead byte.
         (push (+ (nth n-trail-bytes lead-templates)
                  (ldb (byte (nth n-trail-bytes n-lead-bits) (* n-trail-bytes n-trail-bits)) int))
               byte-list)))
   ;; return the list of UTF-8 encoded bytes.
   byte-list))

</lang>

Decoder

<lang lisp> (defun utf-8-to-unicode (byte-list)

 "Take a list of one to four utf-8 encoded bytes (octets), return a code point.
 Signals an error if the first byte is not a lead byte, or if the length of
 BYTE-LIST does not match the length announced by the lead byte."
 (let ((b1 (car byte-list)))
   ;; only the first byte decides the branch; for an ASCII byte any further
   ;; elements of byte-list are ignored.
   (cond ((ascii-byte-p b1) b1) ; if a single byte, just return it.
         ((multi-byte-p b1)
          (if (lead-byte-p b1)
              (let ((n (n-trail-bytes b1))
                    ;; Content bits we want to extract from each lead byte.
                    (lead-templates (list #b01111111 #b00011111 #b00001111 #b00000111))
                    ;; Content bits we want to extract from each trail byte.
                    (trail-template #b00111111))
                (if (= n (1- (list-length byte-list)))
                    ;; add lead byte
                    (+ (ash (logand (nth 0 byte-list) (nth n lead-templates)) (* 6 n))
                       ;; and the trail bytes
                       ;; (they are only masked; their 10xx xxxx prefix is not checked)
                       (loop for i from 1 to n sum
                            (ash (logand (nth i byte-list) trail-template) (* 6 (- n i)))))
                    (error "calculated number of bytes doesnt match the length of the byte list")))
              (error "first byte in the list isnt a lead byte"))))))

</lang>

The test

<lang lisp> (defun test-utf-8 ()

 "Print each sample code point together with its UTF-8 bytes.
 Return t if every sample survives an encode/decode round trip."
 (let* ((samples (list 65 246 1046 8364 119070))
        ;; every sample is round-tripped before anything is printed.
        (round-tripped (loop for code-point in samples
                             collect (utf-8-to-unicode (unicode-to-utf-8 code-point)))))
   (dolist (code-point samples)
     (format t
             "character ~A, code point: ~6x, utf-8: ~{~x ~}~%"
             (code-char code-point)
             code-point
             (unicode-to-utf-8 code-point)))
   ;; t only when each decoded value equals the code point it started from.
   (every #'= samples round-tripped)))

</lang>

Test output

<lang lisp> CL-USER> (test-utf-8) character A, code point: 41, utf-8: 41 character ö, code point: F6, utf-8: C3 B6 character Ж, code point: 416, utf-8: D0 96 character €, code point: 20AC, utf-8: E2 82 AC character 𝄞, code point: 1D11E, utf-8: F0 9D 84 9E T </lang>

C

<lang C>

  1. include <stdio.h>
  2. include <stdlib.h>
  3. include <inttypes.h>

typedef struct { char mask; /* the char data is in these bits */ char lead; /* the start bytes of a utf-8 encoded char */ uint32_t beg; /* beginning of codepoint range */ uint32_t end; /* end of codepoint range */ }utf_t;

utf_t * utf[] = { [0] = &(utf_t){0b00111111, 0b10000000, 0, 0 }, [1] = &(utf_t){0b01111111, 0b00000000, 0000, 0177 }, [2] = &(utf_t){0b00011111, 0b11000000, 0200, 03777 }, [3] = &(utf_t){0b00001111, 0b11100000, 04000, 0177777 }, [4] = &(utf_t){0b00000111, 0b11110000, 0200000, 04177777}, &(utf_t){0}, };

/* All lengths are in bytes */ int codepoint_len(const uint32_t cp); /* len of associated utf-8 char */ int utf8_len(const char ch); /* len of utf-8 encoded char */

char *to_utf8(const uint32_t cp); uint32_t to_cp(const char chr[4]);

/*
 * Return the number of bytes (1-4) needed to encode code point cp in UTF-8.
 * Terminates the program with exit(1) when cp lies outside every range
 * listed in the utf table.
 */
int codepoint_len(const uint32_t cp)
{
    /*
     * Entries 1..4 of utf describe the 1- to 4-byte forms; entry 0 describes
     * continuation bytes and is not a length, so the scan starts at 1 (code
     * point 0 therefore reports length 1 instead of matching entry 0).
     * Scanning by index also keeps the loop inside the table: the previous
     * loop tested the iterator pointer itself, which is never null, so an
     * unmatched code point walked past the end of the array.
     */
    for (int len = 1; len <= 4; ++len) {
        if ((cp >= utf[len]->beg) && (cp <= utf[len]->end)) {
            return len;
        }
    }

    exit(1); /* Out of bounds */
}

/*
 * Return the length in bytes of the UTF-8 sequence introduced by ch, found
 * by matching ch against the lead patterns in the utf table.
 * A continuation byte matches entry 0 and yields 0, as before.
 * Terminates the program with exit(1) when ch matches no entry.
 */
int utf8_len(const char ch)
{
    /*
     * Scan entries 0..4 by index. The previous loop tested the iterator
     * pointer itself, which is never null, so a malformed lead byte walked
     * past the end of the table instead of reaching the exit below.
     */
    for (int len = 0; len <= 4; ++len) {
        if ((ch & ~utf[len]->mask) == utf[len]->lead) {
            return len;
        }
    }

    exit(1); /* Malformed leading byte */
}

/*
 * Encode code point cp as a NUL-terminated UTF-8 string.
 * The result lives in a static buffer, so every call overwrites the previous
 * result and the function is not reentrant.
 * Trail bytes are written from last to first, six bits of cp at a time;
 * the bits that remain go into the lead byte chosen by codepoint_len().
 */
char *to_utf8(const uint32_t cp) { static char ret[5]; const int bytes = codepoint_len(cp);

int shift = 0; for(int i = bytes - 1; i; --i, shift += 6) { ret[i] = (cp >> shift & utf[0]->mask) | utf[0]->lead; } ret[0] = (cp >> shift & utf[bytes]->mask) | utf[bytes]->lead; ret[bytes] = '\0'; return ret; }

/*
 * Decode the UTF-8 sequence starting at chr into a code point.
 * The sequence length comes from the lead byte via utf8_len(). The lead
 * byte's payload bits are shifted into the highest position, then each
 * following byte contributes six more bits. The following bytes are masked
 * with the continuation mask but not otherwise validated.
 * NOTE(review): if utf8_len() reports 0 (a continuation byte passed as the
 * lead byte), shift starts at -6 -- confirm callers never pass such input.
 */
uint32_t to_cp(const char chr[4]) { int bytes = utf8_len(*chr); int shift = 6 * (bytes - 1); uint32_t codep = (*chr++ & utf[bytes]->mask) << shift;

for(int i = 1; i < bytes; ++i, ++chr) { shift -= 6; codep |= ((char)*chr & utf[0]->mask) << shift; }

return codep; }

int main(void) { const uint32_t *input = (uint32_t []){0x0041, 0x00f6, 0x0416, 0x20ac, 0x1d11e, 0x0};

printf("Character Unicode UTF-8 encoding (hex)\n"); printf("----------------------------------------\n");

char *utf8; uint32_t codepoint; for(; *input; ++input) { utf8 = to_utf8(*input); codepoint = to_cp(utf8); printf("%s U+%-7.4x", utf8, codepoint);

for(int i = 0; utf8[i] && i < 4; ++i) { printf("%hhx ", utf8[i]); } printf("\n"); } return 0; } </lang> Output <lang> Character Unicode UTF-8 encoding (hex)


A U+0041 41 ö U+00f6 c3 b6 Ж U+0416 d0 96 € U+20ac e2 82 ac 𝄞 U+1d11e f0 9d 84 9e

</lang>

D

<lang D>import std.conv; import std.stdio;

immutable CHARS = ["A","ö","Ж","€","𝄞"];

void main() {

   writeln("Character   Code-Point   Code-Units");
   foreach (c; CHARS) {
       auto bytes = cast(ubyte[]) c; //The raw bytes of a character can be accessed by casting
       auto unicode = cast(uint) to!dstring(c)[0]; //Convert from a UTF8 string to a UTF32 string, and cast the first character to a number
       writefln("%s              %7X   [%(%X, %)]", c, unicode, bytes);
   }

}</lang>

Output:
Character   Code-Point   Code-Units
A                   41   [41]
ö                   F6   [C3, B6]
Ж                  416   [D0, 96]
€                 20AC   [E2, 82, AC]
𝄞                1D11E   [F0, 9D, 84, 9E]

Elena

ELENA 3.3 : <lang elena>import system'routines. import extensions.

literal extension op {

   literal printAsString
   [
      console print(self," ")
   ]
   literal printAsUTF8Array    
   [
       self toByteArray; forEach(:b) [ console print(b toLiteral(16)," ") ].
   ]
   
   printAsUTF32
   [
       self toArray; forEach(:c)[ console print("U+",c toInt; toLiteral(16)," ")  ].
   ]

}

program = [

   "A" printAsString; printAsUTF8Array; printAsUTF32.
   console printLine.
   
   "ö" printAsString; printAsUTF8Array; printAsUTF32.
   console printLine.
   "Ж" printAsString; printAsUTF8Array; printAsUTF32.
   console printLine.
   "€" printAsString; printAsUTF8Array; printAsUTF32.
   console printLine.
   "𝄞" printAsString; printAsUTF8Array; printAsUTF32.
   console printLine.

].</lang>

Output:
A 41 U+41 
ö C3 B6 U+F6 
Ж D0 96 U+416 
€ E2 82 AC U+20AC 
𝄞 F0 9D 84 9E U+1D11E

F#

<lang fsharp> // Unicode character point to UTF8. Nigel Galloway: March 19th., 2018 let fN g = match List.findIndex (fun n->n>g) [0x80;0x800;0x10000;0x110000] with

          |0->[g]
          |1->[0xc0+(g&&&0x7c0>>>6);0x80+(g&&&0x3f)]
          |2->[0xe0+(g&&&0xf000>>>12);0x80+(g&&&0xfc0>>>6);0x80+(g&&&0x3f)]
          |_->[0xf0+(g&&&0x1c0000>>>18);0x80+(g&&&0x3f000>>>12);0x80+(g&&&0xfc0>>>6);0x80+(g&&&0x3f)]

</lang>

Output:
for n in fN 0x41    do printf "%x " n -> 41
for n in fN 0xf6    do printf "%x " n -> c3 b6 
for n in fN 0x416   do printf "%x " n -> d0 96 
for n in fN 0x20ac  do printf "%x " n -> e2 82 ac 
for n in fN 0x1d11e do printf "%x " n -> f0 9d 84 9e 

Go

Implementation

This implementation is missing all checks for invalid data and so is not production-ready, but illustrates the basic UTF-8 encoding scheme. <lang go>package main

import (

   "bytes"
   "encoding/hex"
   "fmt"
   "log"
   "strings"

)

var testCases = []struct {

   rune
   string

}{

   {'A', "41"},
   {'ö', "C3 B6"},
   {'Ж', "D0 96"},
   {'€', "E2 82 AC"},
   {'𝄞', "F0 9D 84 9E"},

}

// main runs every entry of testCases through encodeUTF8 and decodeUTF8,
// prints the decoded rune, its U+ notation and the encoded bytes, and aborts
// via log.Fatal as soon as a result disagrees with the expected test data.
func main() {

   for _, tc := range testCases {
       // derive some things from test data
       u := fmt.Sprintf("U+%04X", tc.rune)
       // the expected bytes are written as space-separated hex pairs
       b, err := hex.DecodeString(strings.Replace(tc.string, " ", "", -1))
       if err != nil {
           log.Fatal("bad test data")
       }
       // exercise encoder and decoder on test data
       e := encodeUTF8(tc.rune)
       d := decodeUTF8(b)
       // show function return values
       fmt.Printf("%c  %-7s  %X\n", d, u, e)
       // validate return values against test data
       if !bytes.Equal(e, b) {
           log.Fatal("encodeUTF8 wrong")
       }
       if d != tc.rune {
           log.Fatal("decodeUTF8 wrong")
       }
   }

}

const (

   // first byte of a 2-byte encoding starts 110 and carries 5 bits of data
   b2Lead = 0xC0 // 1100 0000
   b2Mask = 0x1F // 0001 1111
   // first byte of a 3-byte encoding starts 1110 and carries 4 bits of data
   b3Lead = 0xE0 // 1110 0000
   b3Mask = 0x0F // 0000 1111
   // first byte of a 4-byte encoding starts 11110 and carries 3 bits of data
   b4Lead = 0xF0 // 1111 0000
   b4Mask = 0x07 // 0000 0111
   // non-first bytes start 10 and carry 6 bits of data
   mbLead = 0x80 // 1000 0000
   mbMask = 0x3F // 0011 1111

)

func encodeUTF8(r rune) []byte {

   switch i := uint32(r); {
   case i <= 1<<7-1: // max code point that encodes into a single byte
       return []byte{byte(r)}
   case i <= 1<<11-1: // into two bytes
       return []byte{
           b2Lead | byte(r>>6),
           mbLead | byte(r)&mbMask}
   case i <= 1<<16-1: // three
       return []byte{
           b3Lead | byte(r>>12),
           mbLead | byte(r>>6)&mbMask,
           mbLead | byte(r)&mbMask}
   default:
       return []byte{
           b4Lead | byte(r>>18),
           mbLead | byte(r>>12)&mbMask,
           mbLead | byte(r>>6)&mbMask,
           mbLead | byte(r)&mbMask}
   }

}

func decodeUTF8(b []byte) rune {

   switch b0 := b[0]; {
   case b0 < 0x80:
       return rune(b0)
   case b0 < 0xE0:
       return rune(b0&b2Mask)<<6 |
           rune(b[1]&mbMask)
   case b0 < 0xF0:
       return rune(b0&b3Mask)<<12 |
           rune(b[1]&mbMask)<<6 |
           rune(b[2]&mbMask)
   default:
       return rune(b0&b4Mask)<<18 |
           rune(b[1]&mbMask)<<12 |
           rune(b[2]&mbMask)<<6 |
           rune(b[3]&mbMask)
   }

}</lang>

Output:
A  U+0041   41
ö  U+00F6   C3B6
Ж  U+0416   D096
€  U+20AC   E282AC
𝄞  U+1D11E  F09D849E

Library/language

<lang go>package main

import (

   "fmt"
   "unicode/utf8"

)

// utf8encode returns the UTF-8 encoding of codepoint, produced by
// utf8.EncodeRune.
func utf8encode(codepoint rune) []byte {
    // utf8.UTFMax is the maximum number of bytes of a UTF-8 encoded rune.
    var buffer [utf8.UTFMax]byte
    written := utf8.EncodeRune(buffer[:], codepoint)
    return buffer[:written]
}

func utf8decode(bytes []byte) rune {

   result, _ := utf8.DecodeRune(bytes)
   return result

}

func main() {

       fmt.Printf("%-7s %7s\t%s\t%s\n", "Char", "Unicode", "UTF-8 encoded", "Decoded");
   for _, codepoint := range []rune{'A', 'ö', 'Ж', '€', '𝄞'} {
       encoded := utf8encode(codepoint)
       decoded := utf8decode(encoded)
       fmt.Printf("%-7c U+%04X\t%-12X\t%c\n", codepoint, codepoint, encoded, decoded)
   }

}</lang>

Output:
Char    Unicode	UTF-8 encoded	Decoded
A       U+0041	41          	A
ö       U+00F6	C3B6        	ö
Ж       U+0416	D096        	Ж
€       U+20AC	E282AC      	€
𝄞       U+1D11E	F09D849E    	𝄞

Alternately: <lang go>package main

import (

   "fmt"

)

// utf8encode returns the UTF-8 encoding of codepoint, relying on Go's
// built-in rune-to-string conversion.
func utf8encode(codepoint rune) []byte {
    encoded := string(codepoint)
    return []byte(encoded)
}

// utf8decode returns the first rune of bytes, relying on Go's built-in
// string-to-rune-slice conversion. bytes must contain at least one rune.
func utf8decode(bytes []byte) rune {
    decoded := []rune(string(bytes))
    return decoded[0]
}

func main() {

       fmt.Printf("%-7s %7s\t%s\t%s\n", "Char", "Unicode", "UTF-8 encoded", "Decoded");
   for _, codepoint := range []rune{'A', 'ö', 'Ж', '€', '𝄞'} {
       encoded := utf8encode(codepoint)
       decoded := utf8decode(encoded)
       fmt.Printf("%-7c U+%04X\t%-12X\t%c\n", codepoint, codepoint, encoded, decoded)
   }

}</lang>

Output:
Char    Unicode	UTF-8 encoded	Decoded
A       U+0041	41          	A
ö       U+00F6	C3B6        	ö
Ж       U+0416	D096        	Ж
€       U+20AC	E282AC      	€
𝄞       U+1D11E	F09D849E    	𝄞

Haskell

Example makes use of bytestring and text packages:

<lang haskell>module Main (main) where

import qualified Data.ByteString as ByteString (pack, unpack) import Data.Char (chr, ord) import Data.Foldable (for_) import Data.List (intercalate) import qualified Data.Text as Text (head, singleton) import qualified Data.Text.Encoding as Text (decodeUtf8, encodeUtf8) import Text.Printf (printf)

-- | Encode a Unicode code point as the list of its UTF-8 byte values.
encodeCodepoint :: Int -> [Int]
encodeCodepoint codepoint = fromIntegral <$> ByteString.unpack utf8Bytes
  where
    -- a one-character 'Text' serialised as UTF-8
    utf8Bytes = Text.encodeUtf8 (Text.singleton (chr codepoint))

-- | Decode a list of UTF-8 byte values and return the code point of the
-- first character. The input must decode to at least one character.
decodeToCodepoint :: [Int] -> Int
decodeToCodepoint values = ord (Text.head decoded)
  where
    -- the byte values packed into a 'ByteString' and read back as UTF-8
    decoded = Text.decodeUtf8 (ByteString.pack (map fromIntegral values))

-- | Print a table with one row per sample code point: the character, its
-- U+ notation, its UTF-8 bytes in hex, and the character obtained by
-- decoding those bytes again.
main :: IO () main = do

   putStrLn "Character  Unicode  UTF-8 encoding (hex)  Decoded"
   putStrLn "-------------------------------------------------"
   for_ [0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E] $ \codepoint -> do
       -- round trip: encode the code point, then decode the resulting bytes
       let values = encodeCodepoint codepoint
           codepoint' = decodeToCodepoint values
       putStrLn $ printf "%c          %-7s  %-20s  %c"
           codepoint
           (printf "U+%04X" codepoint :: String)
           (intercalate " " (map (printf "%02X") values))
           codepoint'</lang>
Output:
Character  Unicode  UTF-8 encoding (hex)  Decoded
-------------------------------------------------
A          U+0041   41                    A
ö          U+00F6   C3 B6                 ö
Ж          U+0416   D0 96                 Ж
€          U+20AC   E2 82 AC              €
𝄞          U+1D11E  F0 9D 84 9E           𝄞

J

Solution: <lang j>utf8=: 8&u: NB. converts to UTF-8 from unicode or unicode codepoint integer ucp=: 9&u: NB. converts to unicode from UTF-8 or unicode codepoint integer ucp_hex=: hfd@(3 u: ucp) NB. converts to unicode codepoint hexadecimal from UTF-8, unicode or unicode codepoint integer</lang>

Examples: <lang j> utf8 65 246 1046 8364 119070 AöЖ€𝄞

  ucp 65 246 1046 8364 119070

AöЖ€𝄞

  ucp 'AöЖ€𝄞'

AöЖ€𝄞

  utf8 ucp 65 246 1046 8364 119070

AöЖ€𝄞

  ucp_hex utf8 65 246 1046 8364 119070

00041 000f6 00416 020ac 1d11e

  utf8@dfh ucp_hex utf8 65 246 1046 8364 119070

AöЖ€𝄞</lang>

Java

Works with: Java version 7+

<lang java>import java.nio.charset.StandardCharsets; import java.util.Formatter;

public class UTF8EncodeDecode {

   /**
    * Encodes a single Unicode code point as UTF-8.
    *
    * @param codepoint the code point to encode
    * @return the UTF-8 bytes of that code point
    */
   public static byte[] utf8encode(int codepoint) {
       final int[] codePoints = {codepoint};
       final String text = new String(codePoints, 0, codePoints.length);
       return text.getBytes(StandardCharsets.UTF_8);
   }
   /**
    * Decodes UTF-8 bytes and returns the first code point they contain.
    *
    * @param bytes the UTF-8 encoded input
    * @return the code point at the start of the decoded text
    */
   public static int utf8decode(byte[] bytes) {
       final String text = new String(bytes, StandardCharsets.UTF_8);
       return text.codePointAt(0);
   }
   public static void main(String[] args) {
       System.out.printf("%-7s %-43s %7s\t%s\t%7s%n",
               "Char", "Name", "Unicode", "UTF-8 encoded", "Decoded");
       for (int codepoint : new int[]{0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E}) {
           byte[] encoded = utf8encode(codepoint);
           Formatter formatter = new Formatter();
           for (byte b : encoded) {
               formatter.format("%02X ", b);
           }
           String encodedHex = formatter.toString();
           int decoded = utf8decode(encoded);
           System.out.printf("%-7c %-43s U+%04X\t%-12s\tU+%04X%n",
                   codepoint, Character.getName(codepoint), codepoint, encodedHex, decoded);
       }
   }

}</lang>

Output:
Char    Name                                        Unicode	UTF-8 encoded	Decoded
A       LATIN CAPITAL LETTER A                      U+0041	41          	A
ö       LATIN SMALL LETTER O WITH DIAERESIS         U+00F6	C3 B6       	ö
Ж       CYRILLIC CAPITAL LETTER ZHE                 U+0416	D0 96       	Ж
€       EURO SIGN                                   U+20AC	E2 82 AC    	€
𝄞      MUSICAL SYMBOL G CLEF                       U+1D11E	F0 9D 84 9E 	𝄞

JavaScript

An implementation in ECMAScript 2015 (ES6): <lang javascript> /***************************************************************************\ |* Pure UTF-8 handling without detailed error reporting functionality. *| |***************************************************************************| |* utf8encode *| |* < String character or UInt32 code point *| |* > Uint8Array encoded_character *| |* | ErrorString *| |* *| |* utf8encode takes a string or uint32 representing a single code point *| |* as its argument and returns an array of length 1 up to 4 containing *| |* utf8 code units representing that character. *| |***************************************************************************| |* utf8decode *| |* < Unit8Array [highendbyte highmidendbyte lowmidendbyte lowendbyte] *| |* > uint32 character *| |* | ErrorString *| |* *| |* utf8decode takes an array of one to four uint8 representing utf8 code *| |* units and returns a uint32 representing that code point. *| \***************************************************************************/

const
  /**
   * Encode one code point as UTF-8.
   * Takes a string (its first code point is used) or a number (masked to
   * 21 bits) and returns a Uint8Array of one to four code units.
   * Throws the string 'Invalid Unicode Code Point!' for values of
   * 0x110000 and above.
   */
  utf8encode = n => {
    const m = typeof n === 'string' ? n.codePointAt(0) : n & 0x1fffff;
    if (m < 0x80)
      return Uint8Array.from([m >> 0 & 0x7f | 0x00]);
    if (m < 0x800)
      return Uint8Array.from([m >> 6 & 0x1f | 0xc0, m >> 0 & 0x3f | 0x80]);
    if (m < 0x10000)
      return Uint8Array.from(
        [m >> 12 & 0x0f | 0xe0, m >> 6 & 0x3f | 0x80, m >> 0 & 0x3f | 0x80]);
    if (m < 0x110000)
      return Uint8Array.from(
        [m >> 18 & 0x07 | 0xf0, m >> 12 & 0x3f | 0x80, m >> 6 & 0x3f | 0x80,
          m >> 0 & 0x3f | 0x80]);
    throw 'Invalid Unicode Code Point!';
  },

  /**
   * Decode one UTF-8 sequence.
   * Takes an array-like of one to four code units and returns the code
   * point as a number. Throws the string 'Invalid UTF-8 encoding!' when the
   * lead byte, the range of the second byte, or any continuation byte is
   * not valid.
   */
  utf8decode = ([m, n, o, p]) => {
    // A continuation byte has the bit pattern 10xx xxxx. The earlier form
    // `o===o&0xbf` parsed as `(o===o)&0xbf` and was therefore always truthy,
    // and `n===(n&0xbf)` let plain ASCII bytes through as continuation bytes.
    const isTrail = b => (b & 0xc0) === 0x80;

    if (m < 0x80)
      return (m & 0x7f) << 0;

    // 0xc0 and 0xc1 could only start overlong encodings, hence 0xc1 < m.
    if (0xc1 < m && m < 0xe0 && isTrail(n))
      return (m & 0x1f) << 6 | (n & 0x3f) << 0;

    // The range of the second byte depends on the lead byte: it rules out
    // overlong forms (lead 0xe0) and surrogates (lead 0xed).
    if ((m === 0xe0 && 0x9f < n && n < 0xc0
      || 0xe0 < m && m < 0xed && 0x7f < n && n < 0xc0
      || m === 0xed && 0x7f < n && n < 0xa0
      || 0xed < m && m < 0xf0 && 0x7f < n && n < 0xc0)
      && isTrail(o))
      return (m & 0x0f) << 12 | (n & 0x3f) << 6 | (o & 0x3f) << 0;

    // Likewise for four bytes: no overlong forms (lead 0xf0) and nothing
    // above U+10FFFF (lead 0xf4).
    if ((m === 0xf0 && 0x8f < n && n < 0xc0
      || m === 0xf4 && 0x7f < n && n < 0x90
      || 0xf0 < m && m < 0xf4 && 0x7f < n && n < 0xc0)
      && isTrail(o) && isTrail(p))
      return (m & 0x07) << 18 | (n & 0x3f) << 12 | (o & 0x3f) << 6 | (p & 0x3f) << 0;

    throw 'Invalid UTF-8 encoding!';
  };

</lang> The testing inputs: <lang javascript> const

 str=
   'AöЖ€𝄞'
,cps=
   Uint32Array.from(str,s=>s.codePointAt(0))
,cus=
   [ [ 0x41]
    ,[ 0xc3,0xb6]
    ,[ 0xd0,0x96]
    ,[ 0xe2,0x82,0xac]
    ,[ 0xf0,0x9d,0x84,0x9e]]
  .map(a=>Uint8Array.from(a))
,zip3=
   ([a,...as],[b,...bs],[c,...cs])=>
     0<as.length+bs.length+cs.length
    ?[ [ a,b,c],...zip3(as,bs,cs)]
    :[ [ a,b,c]]
,inputs=zip3(str,cps,cus);

</lang> The testing code: <lang javascript> console.log(`\ ${'Character'.padEnd(16)}\ ${'CodePoint'.padEnd(16)}\ ${'CodeUnits'.padEnd(16)}\ ${'uft8encode(ch)'.padEnd(16)}\ ${'uft8encode(cp)'.padEnd(16)}\ utf8decode(cu)`) for(let [ch,cp,cu] of inputs)

 console.log(`\

${ch.padEnd(16)}\ ${cp.toString(0x10).padStart(8,'U+000000').padEnd(16)}\ ${`[${[...cu].map(n=>n.toString(0x10))}]`.padEnd(16)}\ ${`[${[...utf8encode(ch)].map(n=>n.toString(0x10))}]`.padEnd(16)}\ ${`[${[...utf8encode(cp)].map(n=>n.toString(0x10))}]`.padEnd(16)}\ ${utf8decode(cu).toString(0x10).padStart(8,'U+000000')}`) </lang> and finally, the output from the test:

Character       CodePoint       CodeUnits       uft8encode(ch)  uft8encode(cp)  utf8decode(cu)
A               U+000041        [41]            [41]            [41]            U+000041
ö               U+0000f6        [c3,b6]         [c3,b6]         [c3,b6]         U+0000f6
Ж               U+000416        [d0,96]         [d0,96]         [d0,96]         U+000416
€               U+0020ac        [e2,82,ac]      [e2,82,ac]      [e2,82,ac]      U+0020ac
𝄞              U+01d11e        [f0,9d,84,9e]   [f0,9d,84,9e]   [f0,9d,84,9e]   U+01d11e

Note that the misalignment on the last line is caused by astral characters having a string length of 2, which throws off the padding functions.

Julia

Works with: Julia version 0.6

Julia supports UTF-8 encoding by default.

<lang julia>for t in ("A", "ö", "Ж", "€", "𝄞")

   enc = Vector{UInt8}(t)
   dec = String(enc)
   println(dec, " → ", enc)

end</lang>

Output:
A → UInt8[0x41]
ö → UInt8[0xc3, 0xb6]
Ж → UInt8[0xd0, 0x96]
€ → UInt8[0xe2, 0x82, 0xac]
𝄞 → UInt8[0xf0, 0x9d, 0x84, 0x9e]

Kotlin

<lang scala>// version 1.1.2

/** Encodes a single Unicode code point as UTF-8 bytes. */
fun utf8Encode(codePoint: Int): ByteArray {
    val text = String(intArrayOf(codePoint), 0, 1)
    return text.toByteArray(Charsets.UTF_8)
}

/** Decodes UTF-8 bytes and returns the first code point they contain. */
fun utf8Decode(bytes: ByteArray): Int {
    val text = String(bytes, Charsets.UTF_8)
    return text.codePointAt(0)
}

fun main(args: Array<String>) {

   val codePoints = intArrayOf(0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E)
   println("Char  Name                                 Unicode  UTF-8         Decoded")
   for (codePoint in codePoints) {
       var n = if(codePoint <= 0xFFFF) 4 else 5 
       System.out.printf("%-${n}c  %-35s  U+%05X  ", codePoint, Character.getName(codePoint), codePoint)  
       val bytes = utf8Encode(codePoint)
       var s = ""
       for (byte in bytes) s += "%02X ".format(byte)
       val decoded = utf8Decode(bytes)
       n = if(decoded.toInt() <= 0xFFFF) 12 else 11 
       System.out.printf("%-${n}s  %c\n", s, decoded)  
   } 

}</lang>

Output:
Char  Name                                 Unicode  UTF-8         Decoded
A     LATIN CAPITAL LETTER A               U+00041  41            A
ö     LATIN SMALL LETTER O WITH DIAERESIS  U+000F6  C3 B6         ö
Ж     CYRILLIC CAPITAL LETTER ZHE          U+00416  D0 96         Ж
€     EURO SIGN                            U+020AC  E2 82 AC      €
𝄞     MUSICAL SYMBOL G CLEF                U+1D11E  F0 9D 84 9E   𝄞

Lingo

Since UTF-8 is Lingo's native string encoding, and UTF-8 strings can be read into byteArrays (and v.v.), such UTF-8 encoding and decoding is built-in.
Relevant Lingo functions are:
- charToNum (string): converts single-character string to unicode code point (int)
- numToChar (int): converts unicode code point (int) to single-character string
- byteArray (string): creates byte array of UTF-8 bytes for string
- byteArray.toHexString (intStart, intLen): returns hex string representation of byte array (e.g. for printing)
- byteArray.readRawString (intLen, [strCharSet="UTF-8"]): reads a fixed number of bytes as a string <lang Lingo>chars = ["A", "ö", "Ж", "€", "𝄞"] put "Character Unicode (int) UTF-8 (hex) Decoded" repeat with c in chars

   ba = bytearray(c)
   put col(c, 12) & col(charToNum(c), 16) & col(ba.toHexString(1, ba.length), 14) & ba.readRawString(ba.length)

end repeat</lang> Helper function for table formatting <lang Lingo>on col (val, len)

   str = string(val)
   repeat with i = str.length+1 to len
       put " " after str
   end repeat
   return str

end</lang>

Output:
Character   Unicode (int)   UTF-8 (hex)   Decoded
A           65              41            A
ö           246             c3 b6         ö
Ж           1046            d0 96         Ж
€           8364            e2 82 ac      €
𝄞           119070          f0 9d 84 9e   𝄞

Mathematica

<lang Mathematica>utf = ToCharacterCode[ToString["AöЖ€", CharacterEncoding -> "UTF8"]] ToCharacterCode[FromCharacterCode[utf, "UTF8"]]</lang>

Output:
{65, 195, 182, 208, 150, 226, 130, 172}
{65, 246, 1046, 8364}

Perl

<lang perl>#!/usr/bin/perl use strict; use warnings; use Unicode::UCD 'charinfo'; # getting the unicode name of the character use utf8; # using non-ascii-characters in source code binmode STDOUT, ":encoding(UTF-8)"; # printing non-ascii-characters to screen

my @chars = map {ord} qw/A ö Ж € 𝄞/; # @chars contains the unicode points my $print_format = '%5s  %-35s'; printf "$print_format %8s %s\n" , 'char', 'name', 'unicode', 'utf-8 encoding'; map{ my $name = charinfo($_)->{'name'}; # get unicode name printf "$print_format %06x " , chr, lc $name, $_; my $utf8 = chr; # single char (using implicit $_) utf8::encode($utf8); # inplace encoding into utf8 parts map{ # for each utf8 char print ord printf " %x", ord; } split //, $utf8; print "\n"; } @chars;</lang>

Output:
 char  name                                 unicode  utf-8 encoding
    A  latin capital letter a               000041   41
    ö  latin small letter o with diaeresis  0000f6   c3 b6
    Ж  cyrillic capital letter zhe          000416   d0 96
    €  euro sign                            0020ac   e2 82 ac
    𝄞  musical symbol g clef                01d11e   f0 9d 84 9e

Perl 6

Works with: Rakudo version 2017.02

Pretty much all built in to the language. <lang perl6>say sprintf("%-18s %-36s|%8s| %7s |%14s | %s\n", 'Character|', 'Name', 'Ordinal', 'Unicode', 'UTF-8 encoded', 'decoded'), '-' x 100;

for < A ö Ж € 𝄞 😜 👨‍👩‍👧‍👦> -> $char {

   printf "   %-5s | %-43s | %6s | %-7s | %12s  |%4s\n", $char, $char.uninames.join(','), $char.ords.join(' '),
     ('U+' X~ $char.ords».base(16)).join(' '), $char.encode('UTF8').list».base(16).Str, $char.encode('UTF8').decode;

}</lang>

Output:
Character|         Name                                | Ordinal| Unicode | UTF-8 encoded | decoded
----------------------------------------------------------------------------------------------------
   A     | LATIN CAPITAL LETTER A                      |     65 | U+41    |           41  |   A
   ö     | LATIN SMALL LETTER O WITH DIAERESIS         |    246 | U+F6    |        C3 B6  |   ö
   Ж    | CYRILLIC CAPITAL LETTER ZHE                 |   1046 | U+416   |        D0 96  |   Ж
   €     | EURO SIGN                                   |   8364 | U+20AC  |     E2 82 AC  |   €
   𝄞     | MUSICAL SYMBOL G CLEF                       | 119070 | U+1D11E |  F0 9D 84 9E  |   𝄞
   😜    | FACE WITH STUCK-OUT TONGUE AND WINKING EYE  | 128540 | U+1F61C |  F0 9F 98 9C  |   😜
   👨‍👩‍👧‍👦    | MAN,ZERO WIDTH JOINER,WOMAN,ZERO WIDTH JOINER,GIRL,ZERO WIDTH JOINER,BOY | 128104 8205 128105 8205 128103 8205 128102 | U+1F468 U+200D U+1F469 U+200D U+1F467 U+200D U+1F466 | F0 9F 91 A8 E2 80 8D F0 9F 91 A9 E2 80 8D F0 9F 91 A7 E2 80 8D F0 9F 91 A6  |   👨‍👩‍👧‍👦

Phix

Standard autoinclude, see the manual and/or builtins/utfconv.e ( http://phix.x10.mx/docs/html/utfconv.htm and/or https://bitbucket.org/petelomax/phix/src )
As requested in the task description: <lang Phix>constant tests = {#0041, #00F6, #0416, #20AC, #1D11E}

function hex(sequence s, string fmt) -- output helper

   -- Formats every element of s with the sprintf format fmt and returns the
   -- formatted pieces joined by commas.
   -- The elements of s are overwritten with their formatted text
   -- (NOTE(review): presumably only the local copy changes, as Phix passes
   -- sequences by value - confirm).
   for i=1 to length(s) do
       s[i] = sprintf(fmt,s[i])
   end for
   return join(s,',')

end function

for i=1 to length(tests) do

   integer codepoint = tests[i]
   -- Round trip: code point -> UTF-8 byte sequence (s) -> UTF-32 code points (r).
   sequence s = utf32_to_utf8({codepoint}),
            r = utf8_to_utf32(s)
   -- Show the original code point, the encoded bytes and the decoded result.
   printf(1,"#%04x -> {%s} -> {%s}\n",{codepoint, hex(s,"#%02x"),hex(r,"#%04x")})

end for</lang>

Output:
#0041 -> {#41} -> {#0041}
#00F6 -> {#C3,#B6} -> {#00F6}
#0416 -> {#D0,#96} -> {#0416}
#20AC -> {#E2,#82,#AC} -> {#20AC}
#1D11E -> {#F0,#9D,#84,#9E} -> {#1D11E}

Python

<lang python>

#!/usr/bin/env python3

from unicodedata import name


def unicode_code(ch):
    """Return the ``U+xxxx`` label for ``ch``: lowercase hex, at least four digits."""
    code_point = ord(ch)
    return 'U+' + format(code_point, '04x')


def utf8hex(ch):
    """Return the UTF-8 encoding of ``ch`` as space-separated uppercase hex bytes.

    Every byte is rendered with exactly two hex digits, so a byte below 0x10
    (for example the newline byte 0x0A) keeps its leading zero.
    """
    return " ".join('{:02X}'.format(byte) for byte in ch.encode('utf8'))


if __name__ == "__main__":

   print('{:<11} {:<36} {:<15} {:<15}'.format('Character', 'Name', 'Unicode', 'UTF-8 encoding (hex)'))
   chars = ['A', 'ö', 'Ж', '€', '𝄞']
   for char in chars:
       print('{:<11} {:<36} {:<15} {:<15}'.format(char, name(char), unicode_code(char), utf8hex(char)))</lang>
Output:
Character   Name                                 Unicode         UTF-8 encoding (hex)
A           LATIN CAPITAL LETTER A               U+0041          41             
ö           LATIN SMALL LETTER O WITH DIAERESIS  U+00f6          C3 B6          
Ж           CYRILLIC CAPITAL LETTER ZHE          U+0416          D0 96          
€           EURO SIGN                            U+20ac          E2 82 AC       
𝄞           MUSICAL SYMBOL G CLEF                U+1d11e         F0 9D 84 9E

Racket

<lang racket>#lang racket

;; Association list pairing a descriptive symbol with the character it names.
(define char-map
  (list (cons 'LATIN-CAPITAL-LETTER-A              #\U0041)
        (cons 'LATIN-SMALL-LETTER-O-WITH-DIAERESIS #\U00F6)
        (cons 'CYRILLIC-CAPITAL-LETTER-ZHE         #\U0416)
        (cons 'EURO-SIGN                           #\U20AC)
        (cons 'MUSICAL-SYMBOL-G-CLEF               #\U1D11E)))

;; For every (name . char) entry print, separated by tabs: the char in `write`
;; form, the char itself, its UTF-8 bytes in hex, the string decoded from
;; those bytes, and the name.
(for ([entry (in-list char-map)])
  (match-define (cons name chr) entry)
  (define utf8-bytes (bytes->list (string->bytes/utf-8 (string chr))))
  (define hex-bytes
    (for/list ([b (in-list utf8-bytes)])
      (number->string b 16)))
  (define decoded (bytes->string/utf-8 (list->bytes utf8-bytes)))
  (printf "~s\t~a\t~a\t~a\t~a~%" chr chr hex-bytes decoded name))</lang>
Output:
#\A	A	(41)	A	LATIN-CAPITAL-LETTER-A
#\ö	ö	(c3 b6)	ö	LATIN-SMALL-LETTER-O-WITH-DIAERESIS
#\Ж	Ж	(d0 96)	Ж	CYRILLIC-CAPITAL-LETTER-ZHE
#\€	€	(e2 82 ac)	€	EURO-SIGN
#\𝄞	𝄞	(f0 9d 84 9e)	𝄞	MUSICAL-SYMBOL-G-CLEF

Scala

Imperative solution

<lang scala>object UTF8EncodeAndDecode extends App {
  // Imported locally so that this snippet compiles on its own.
  import java.nio.charset.StandardCharsets

  /** Code points exercised by the demo. */
  val codePoints = Seq(0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E)

  /** Encodes one Unicode code point as its UTF-8 byte sequence. */
  def utf8Encode(codepoint: Int): Array[Byte] =
    new String(Array[Int](codepoint), 0, 1).getBytes(StandardCharsets.UTF_8)

  /** Decodes UTF-8 bytes and returns the first code point of the resulting string. */
  def utf8Decode(bytes: Array[Byte]): Int =
    new String(bytes, StandardCharsets.UTF_8).codePointAt(0)

  println("Char Name                                 Unicode  UTF-8       Decoded")
  for (codePoint <- codePoints) {
    val w = if (Character.isBmpCodePoint(codePoint)) 4 else 5 // Compute spacing
    val bytes = utf8Encode(codePoint)
    def leftAlignedHex = f"U+${codePoint}%04X"
    // Hex dump of the encoded bytes, each byte followed by a space.
    val s = new StringBuilder()
    bytes.foreach(byte => s ++= "%02X ".format(byte))
    printf(s"%-${w}c %-36s %-7s  %-${16 - w}s%c%n",
      codePoint, Character.getName(codePoint), leftAlignedHex, s, utf8Decode(bytes))
  }
}</lang>

Functional solution

<lang scala>import java.nio.charset.StandardCharsets

object UTF8EncodeAndDecode extends App {

  /** Code points exercised by the demo. */
  val codePoints = Seq(0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E)

  /** Encodes one Unicode code point as its UTF-8 byte sequence. */
  def utf8Encode(codepoint: Int): Array[Byte] =
    new String(Array[Int](codepoint), 0, 1).getBytes(StandardCharsets.UTF_8)

  /** Decodes UTF-8 bytes and returns the first code point of the resulting string. */
  def utf8Decode(bytes: Array[Byte]): Int =
    new String(bytes, StandardCharsets.UTF_8).codePointAt(0)

  println("Char Name                                 Unicode  UTF-8       Decoded")
  codePoints.foreach { codePoint =>
    val w = if (Character.isBmpCodePoint(codePoint)) 4 else 5 // Compute spacing
    val bytes = utf8Encode(codePoint)
    def leftAlignedHex: String = f"U+${codePoint}%04X"
    // Hex dump of the encoded bytes, each byte followed by a space.
    def utf: String = bytes.foldLeft("")(_ + "%02X ".format(_))
    printf(s"%-${w}c %-36s %-7s  %-${16 - w}s%c%n",
      codePoint, Character.getName(codePoint), leftAlignedHex, utf, utf8Decode(bytes))
  }
  // System.currentTimeMillis replaces the deprecated scala.compat.Platform.currentTime.
  println(s"\nSuccessfully completed without errors. [total ${System.currentTimeMillis() - executionStart} ms]")

}</lang>

Composable and testable solution

<lang scala>package example

object UTF8EncodeAndDecode extends TheMeat with App {

  /** Code points exercised by the demo. */
  val codePoints = Seq(0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E)
  println("Char Name                                 Unicode  UTF-8       Decoded")
  // composeString already ends each row with a line separator, hence print rather than println.
  codePoints.foreach { codepoint => print(composeString(codepoint)) }
  // System.currentTimeMillis replaces the deprecated scala.compat.Platform.currentTime.
  println(s"\nSuccessfully completed without errors. [total ${System.currentTimeMillis() - executionStart} ms]")

}

trait TheMeat {

  import java.nio.charset.StandardCharsets

  /** Encodes one Unicode code point as its UTF-8 byte sequence. */
  def utf8Encode(codepoint: Int): Array[Byte] = {
    val text = new String(Array[Int](codepoint), 0, 1)
    text.getBytes(StandardCharsets.UTF_8)
  }

  /** Decodes UTF-8 bytes and returns the first code point of the resulting string. */
  def utf8Decode(bytes: Array[Byte]): Int = {
    val text = new String(bytes, StandardCharsets.UTF_8)
    text.codePointAt(0)
  }

  /** Builds one table row for `codePoint`: the character, its Unicode name,
    * its U+ label, the UTF-8 bytes in hex and the character decoded from
    * those bytes, terminated by a line separator.
    */
  def composeString(codePoint: Int): String = {
    val encoded = utf8Encode(codePoint)
    // Column spacing: width 4 for BMP code points, 5 otherwise.
    val charWidth = if (Character.isBmpCodePoint(codePoint)) 4 else 5
    val codePointLabel = f"U+$codePoint%04X"
    // Each byte as two uppercase hex digits followed by a space.
    val hexBytes = encoded.map(b => "%02X ".format(b)).mkString
    val layout = s"%-${charWidth}c %-36s %-7s  %-${16 - charWidth}s%c%n"
    layout.format(codePoint, Character.getName(codePoint), codePointLabel, hexBytes, utf8Decode(encoded))
  }

} </lang>

Sidef

<lang ruby>func utf8_encoder(Number code) {

   # Turn the code point into a character, encode it as UTF-8 and return the
   # encoded bytes as an array of one-character strings.
   code.chr.encode('UTF-8').bytes.map{.chr}

}

func utf8_decoder(Array bytes) {

   # Convert each one-character string back to its numeric value, then decode
   # the resulting byte values as UTF-8.
   bytes.map{.ord}.decode('UTF-8')

}

for n in ([0x0041, 0x00F6, 0x0416, 0x20AC, 0x1D11E]) {

   var encoded = utf8_encoder(n)
   var decoded = utf8_decoder(encoded)
   # Round-trip check: decoding the encoded bytes must give back the code point.
   assert_eq(n, decoded.ord)
   say "#{decoded} -> #{encoded}"

}</lang>

Output:
A -> ["A"]
ö -> ["\xC3", "\xB6"]
Ж -> ["\xD0", "\x96"]
€ -> ["\xE2", "\x82", "\xAC"]
𝄞 -> ["\xF0", "\x9D", "\x84", "\x9E"]

Tcl

Note: Tcl can handle Unicodes only up to U+FFFD, i.e. the Basic Multilingual Plane (BMP, 16 bits wide). Therefore, the fifth test fails as expected. <lang Tcl>proc encoder int {
   # Return the UTF-8 encoding of code point $int as a list of two-digit
   # uppercase hex byte values.
   set u [format %c $int]
   set bytes {}
   foreach byte [split [encoding convertto utf-8 $u] ""] {
      lappend bytes [format %02X [scan $byte %c]]
   }
   return $bytes
}

proc decoder bytes {
   # Rebuild the byte string from a list of hex byte values and decode it
   # from UTF-8 into a Tcl string.
   set str {}
   foreach byte $bytes {
      append str [format %c [scan $byte %x]]
   }
   return [encoding convertfrom utf-8 $str]
}

# Each command must start on its own line; otherwise the following command's
# words are taken as extra arguments to proc.
foreach test {0x0041 0x00f6 0x0416 0x20ac 0x1d11e} {
   # Output row: the code point, its encoded bytes, "->", the decoded character.
   set res $test
   lappend res [encoder $test] -> [decoder [encoder $test]]
   puts $res
}</lang>

0x0041 41 -> A
0x00f6 {C3 B6} -> ö
0x0416 {D0 96} -> Ж
0x20ac {E2 82 AC} -> €
0x1d11e {EF BF BD} -> �

Alternative Implementation

While perhaps not as readable as the above, this version handles beyond-BMP codepoints by manually composing the utf-8 byte sequences and emitting raw bytes to the console. The encoding convertto utf-8 command still does the heavy lifting where it can.

<lang Tcl>proc utf8 {codepoint} {
    # Return the UTF-8 bytes for a code point given as a hex string (e.g. 20AC).
    scan $codepoint %llx cp

    # Below 0x10000: substitute a \u escape and let Tcl do the encoding.
    if {$cp < 0x10000} {
        return [encoding convertto utf-8 [subst \\u$codepoint]]
    }

    # Otherwise build the 4-byte form by hand: split the 21-bit binary string
    # into groups of 3/6/6/6 bits and prefix each group with its UTF-8 marker.
    set bits [format %021b $cp]
    set unibits [join [list \
        11110[string range $bits 0 2] \
        10[string range $bits 3 8] \
        10[string range $bits 9 14] \
        10[string range $bits 15 20]] ""]
    return [binary format B* $unibits]
}

proc hexchars {s} {
    # Return the bytes of $s as two-digit hex values, each one followed by a
    # single space (so the result ends with a space).
    binary scan $s H* digits
    return [regsub -all {..} $digits {& }]
}

# for the test, we assume the tty is in utf-8 mode and can handle beyond-BMP chars
# so set output mode to binary so we can write raw bytes!

# Write raw bytes to stdout. This must be its own command: placed on the same
# line, the foreach words would be passed to chan configure as extra arguments.
chan configure stdout -encoding binary

foreach codepoint { 41 F6 416 20AC 1D11E } {
    # Output row: U+ label, the raw UTF-8 bytes, then the bytes in hex.
    set utf8 [utf8 $codepoint]
    puts "[format U+%04s $codepoint]\t$utf8\t[hexchars $utf8]"
}</lang>

Output:
U+0041  A       41
U+00F6  ö       c3 b6
U+0416  Ж       d0 96
U+20AC  €       e2 82 ac
U+1D11E 𝄞       f0 9d 84 9e

zkl

<lang zkl>println("Char Unicode UTF-8"); foreach utf,unicode_int in (T( T("\U41;",0x41), T("\Uf6;",0xf6),

      T("\U416;",0x416), T("\U20AC;",0x20ac), T("\U1D11E;",0x1d11e))){
   // Each test pair holds a string written with a \U...; escape and the
   // matching code point as an integer.
   // Fold the bytes of utf into one integer, first byte most significant;
   // it is printed below with %x as the UTF-8 byte sequence.
   utf_int:=utf.reduce(fcn(s,c){ 0x100*s + c.toAsc() },0);
   char :=unicode_int.toString(-8);	// Unicode int to UTF-8 string
   // UTF-8 bytes to UTF-8 string:
   char2:=Data(Void,utf_int.toBigEndian(utf_int.len())).text;
   // Columns: encoded char, char rebuilt from the bytes, U+ code point, bytes in hex.
   println("%s %s %9s  %x".fmt(char,char2,"U+%x".fmt(unicode_int),utf_int));

}</lang> Int.len() --> number of bytes in int. This could be hard coded because UTF-8 has a max of 6 bytes and (0x41).toBigEndian(6) --> 0x41,0,0,0,0,0 which is a zero terminated string ("A");

Output:
Char  Unicode  UTF-8
A A      U+41  41
ö ö      U+f6  c3b6
Ж Ж     U+416  d096
€ €    U+20ac  e282ac
𝄞 𝄞   U+1d11e  f09d849e