XXXX redacted: Difference between revisions

→‎{{header|Go}}: Rewritten to eliminate many of the previous problems. Now does all the Raku test cases.
(→‎{{header|Phix}}: try to make clear for other readers that a stretch goal is indeed a stretch)
(→‎{{header|Go}}: Rewritten to eliminate many of the previous problems. Now does all the Raku test cases.)
Line 63:
 
=={{header|Go}}==
{{libheader|Unicode Text Segmentation for Go}}
As far as the basic part of the task is concerned, words containing hyphens and apostrophes need special handling because Go's regular expression engine (which is based on RE2 and guaranteed to run in linear time relative to the size of the input) doesn't support constructs such as look-ahead and look-behind.
Go has a problem with zero-width joiner (ZWJ) emojis such as the final one in the test string, which the language does not recognize as a single 'character' because it consists of five Unicode code points (or 'runes') instead of one. This problem is aggravated (as here) when one of the constituents of the ZWJ emoji happens to be a 'normal' emoji contained within the same test string!
 
Care is therefore needed to ensure that redacting a normal emoji doesn't also redact one of the constituents of a ZWJ emoji.
The stretch goal has been achieved at the expense of assuming that each emoji grapheme is always followed by white-space or is at the end of the text. Go has a problem with zero width joiner (ZWJ) emojis such as the final one in the test string which is not recognized as a single 'character' by the language as it consists of five Unicode code-points (or 'runes') instead of one. This problem is aggravated (as here) when one of the constituents of the ZWJ emoji happens to be a 'normal' emoji contained within the same test string! To get the number of 'X's right where a ZWJ emoji is being replaced, the code looks for its first zero width joiner character (U+200d) and skips to the next white-space character, if there is one, after that.
 
To get the number of 'X's right where a ZWJ emoji or other character combination is being replaced, a third-party library function is used which counts the number of graphemes in a string, as required by the task.
<lang go>package main
 
import (
"fmt"
"github.com/rivo/uniseg"
"log"
"regexp"
"strings"
"unicode"
"unicode/utf8"
)
 
func findNextSpacejoin(runeswords, seps []runestring) intstring {
for i, rlw := range runes {len(words)
ls := if unicode.IsSpacelen(rseps) {
if lw != ls+1 return i{
log.Fatal("mismatch between number of words and separators")
}
}
var sb strings.Builder
return -1
for i := 0; i < ls; i++ {
sb.WriteString(words[i])
sb.WriteString(seps[i])
}
sb.WriteString(words[lw-1])
return sb.String()
}
 
func redact(text, word, opts string) {
var partial, overkill bool
// record rune indices of any hyphens or apostrophes and temporarily replace them with
exp := word
// underscores so the former are not treated as word boundaries
if strings.IndexByte(opts, 'p') >= 0 {
var hyphens, aposts []int
partial = true
runes := []rune(text)
for i, r := range runes {
if r == '-' {
hyphens = append(hyphens, i)
runes[i] = '_'
} else if r == '\'' {
aposts = append(aposts, i)
runes[i] = '_'
}
}
if strings.IndexByte(opts, 'o') >= 0 {
text = string(runes)
var exp string overkill = true
switch opts {
case "[w|s|n]":
exp = `\b` + word + `\b`
case "[w|i|n]":
exp = `(?i)\b` + word + `\b`
case "[p|s|n]":
exp = word
case "[p|i|n]":
exp = `(?i)` + word
case "[p|s|o]", "[w|s|o]":
exp = `\b\w*` + word + `\w*\b`
case "[p|i|o]", "[w|i|o]":
exp = `(?i)\b\w*` + word + `\w*\b`
case "[w]":
exp = word + `(\s|$)`
}
if strings.IndexByte(opts, 'i') >= 0 {
rgx := regexp.MustCompile(exp)
if opts exp == "[w]"`(?i)` + {exp
rf := func(match string) string {
var res []rune
runes := []rune(match)
for i := 0; i < len(runes); i++ {
r := runes[i]
if r == '\u200d' { // zero width joiner character
ix := findNextSpace(runes[i+1:])
if ix == -1 {
break
} else {
i += ix + 1 // skip to next space
res = append(res, runes[i])
continue
}
}
if unicode.IsSpace(r) {
res = append(res, r)
} else {
res = append(res, 'X')
}
}
return string(res)
}
text = rgx.ReplaceAllStringFunc(text, rf)
} else if len(opts) == 7 && opts[5] == 'n' {
repl := strings.Repeat("X", utf8.RuneCountInString(word))
text = rgx.ReplaceAllLiteralString(text, repl)
} else {
rf := func(match string) string {
return strings.Repeat("X", utf8.RuneCountInString(match))
}
text = rgx.ReplaceAllStringFunc(text, rf)
}
rgx := regexp.MustCompile(`[\s!-&(-,./:-@[-^{-~]+`) // all punctuation except -'_
// put back any hyphens or apostrophes which haven't been replaced by 'X'
runesseps := []runergx.FindAllString(text, -1)
for _, iwords := range hyphensrgx.Split(text, {-1)
rgx2 := regexp.MustCompile(exp)
if runes[i] == '_' {
for i, w := range words runes[i] = '-'{
match := rgx2.FindString(w)
// check there's a match and it's not part of a ZWJ emoji
if match == "" || strings.Index(w, match+"\u200d") >= 0 ||
strings.Index(w, "\u200d"+match) >= 0 {
continue
}
} switch {
for _, i := rangecase aposts {overkill:
if runes words[i] == '_'strings.Repeat("X", {uniseg.GraphemeClusterCount(w))
case runes[i] = '\''!partial:
if words[i] == match {
words[i] = strings.Repeat("X", uniseg.GraphemeClusterCount(w))
}
case partial:
repl := strings.Repeat("X", uniseg.GraphemeClusterCount(word))
words[i] = rgx2.ReplaceAllLiteralString(w, repl)
}
}
fmt.Printf("%s %s\n\n", opts, join(words, seps))
text = string(runes)
fmt.Printf("%s %s\n\n", opts, text)
}
 
func printResults(text string, allOpts, allWords []string) {
func main() {
fmt.Printf("Text: %s\n\n", text)
text := `Tom? Toms bottom tomato is in his stomach while playing the "Tom-tom" brand tom-toms. That's so tom.
for _, word := range allWords {
'Tis very tomish, don't you think?`
allOpts := []string{"[w|s|n]", "[w|i|n]", "[p|s|n]", "[p|i|n]", "[p|s|o]", "[p|i|o]"}
for _, word := range []string{"Tom", "tom", "t"} {
fmt.Printf("Redact '%s':\n", word)
for _, opts := range allOpts {
redact(text, word, opts)
}
fmt.Println()
}
fmt.Println()
}
 
func main() {
text := `Tom? Toms bottom tomato is in his stomach while playing the "Tom-tom" brand tom-toms. That's so tom.
'Tis very tomish, don't you think?`
allOpts := []string{"[w|s|n]", "[w|i|n]", "[p|s|n]", "[p|i|n]", "[p|s|o]", "[p|i|o]"}
allWords := []string{"Tom", "tom", "t"}
printResults(text, allOpts, allWords)
 
text = "🧑 👨 🧔 👨‍👩‍👦"
forallOpts _, word := range []string{"👨", "👨‍👩‍👦[w]"} {
allWords = []string{"👨", "👨‍👩‍👦"}
fmt.Printf("Redact '%s':\n", word)
redactprintResults(text, wordallOpts, "[w]"allWords)
 
fmt.Println()
text = "Argentina🧑🇦🇹 France👨🇫🇷 Germany🧔🇩🇪 Netherlands👨‍👩‍👦🇳🇱"
}
allOpts = []string{"[p]", "[p|o]"}
printResults(text, allOpts, allWords)
}</lang>
 
{{out}}
<pre style="height:80ex;overflow:scroll;">
<pre>
Text: Tom? Toms bottom tomato is in his stomach while playing the "Tom-tom" brand tom-toms. That's so tom.
'Tis very tomish, don't you think?
 
Redact 'Tom':
[w|s|n] XXX? Toms bottom tomato is in his stomach while playing the "Tom-tom" brand tom-toms. That's so tom.
Line 207 ⟶ 183:
[p|i|o] XXX? XXXX XXXXXX XXXXXX is in his XXXXXXX while playing the "XXXXXXX" brand XXXXXXXX. That's so XXX.
'Tis very XXXXXX, don't you think?
 
 
Redact 'tom':
Line 227 ⟶ 202:
[p|i|o] XXX? XXXX XXXXXX XXXXXX is in his XXXXXXX while playing the "XXXXXXX" brand XXXXXXXX. That's so XXX.
'Tis very XXXXXX, don't you think?
 
 
Redact 't':
Line 248 ⟶ 222:
XXXX very XXXXXX, XXXXX you XXXXX?
 
 
Text: 🧑 👨 🧔 👨‍👩‍👦
 
Redact '👨':
[w] 🧑 X 🧔 👨‍👩‍👦
 
 
Redact '👨‍👩‍👦':
[w] 🧑 👨 🧔 X
</pre>
 
 
Text: Argentina🧑🇦🇹 France👨🇫🇷 Germany🧔🇩🇪 Netherlands👨‍👩‍👦🇳🇱
 
Redact '👨':
[p] Argentina🧑🇦🇹 FranceX🇫🇷 Germany🧔🇩🇪 Netherlands👨‍👩‍👦🇳🇱
 
[p|o] Argentina🧑🇦🇹 XXXXXXXX Germany🧔🇩🇪 Netherlands👨‍👩‍👦🇳🇱
 
Redact '👨‍👩‍👦':
[p] Argentina🧑🇦🇹 France👨🇫🇷 Germany🧔🇩🇪 NetherlandsX🇳🇱
 
[p|o] Argentina🧑🇦🇹 France👨🇫🇷 Germany🧔🇩🇪 XXXXXXXXXXXXX
</pre>
 
=={{header|Julia}}==
9,482

edits