Strip control codes and extended characters from a string

From Rosetta Code
Revision as of 06:40, 10 October 2011 by rosettacode>Georg Peter (Add Seed7 example)
Task
Strip control codes and extended characters from a string
You are encouraged to solve this task according to the task description, using any language you may know.

The task is to strip control codes and extended characters from a string. The solution should demonstrate how to achieve each of the following results:

  • a string with control codes stripped (but extended characters not stripped)
  • a string with control codes and extended characters stripped

In ASCII, the control codes have decimal codes 0 through 31 and 127; codes greater than 127 are extended characters. On an ASCII based system, if both the control codes and the extended characters are stripped, the resultant string would have all of its characters within the range of 32 to 126 decimal on the ASCII table.

On a non-ASCII based system, we consider characters that do not have a corresponding glyph on the ASCII table (within the ASCII range of 32 to 126 decimal) to be an extended character for the purpose of this task.

C

<lang C>#include <stdio.h>

#include <stdlib.h>

#define IS_CTRL (1 << 0)
#define IS_EXT (1 << 1)
#define IS_ALPHA (1 << 2)
#define IS_DIGIT (1 << 3) /* not used, just give you an idea */

unsigned int char_tbl[256] = {0};

/* could use ctypes, but then they pretty much do the same thing */
/* Set the IS_* class bits in char_tbl for every byte value (ASCII layout assumed). */
void init_table()
{
	int code;

	for (code = 0; code < 256; code++) {
		/* control codes: 0..31 plus DEL */
		if (code < 32 || code == 127)
			char_tbl[code] |= IS_CTRL;

		/* letters: 'a'..'z' sit 0x20 above 'A'..'Z' */
		if ((code >= 'A' && code <= 'Z') ||
		    (code >= 'A' + 0x20 && code <= 'Z' + 0x20))
			char_tbl[code] |= IS_ALPHA;

		/* everything with the high bit set */
		if (code >= 128)
			char_tbl[code] |= IS_EXT;
	}
}

/* depends on what "stripped" means; we do it in place.
 *
 * "what" is a combination of the IS_* macros, meaning strip if
 * a char IS_ any of them
 */
void strip(char * str, int what)
{
	unsigned char *in = (unsigned char *)str;  /* read cursor */
	unsigned char *out = in;                    /* write cursor, never ahead of in */

	for (; *in != '\0'; in++) {
		if (char_tbl[*in] & what)
			continue;                   /* byte is in a class being stripped */
		*out++ = *in;
	}
	*out = '\0';
}

/* Demonstrates strip() three times on a string holding each of the byte
 * values 1..255 once: control codes only, control + extended, and
 * control + extended + letters. */
int main()
{
	char a[256];
	int i, pass;
	/* class masks for the three demonstrations, in output order */
	const int masks[3] = { IS_CTRL, IS_CTRL | IS_EXT, IS_CTRL | IS_EXT | IS_ALPHA };

	init_table();

	for (pass = 0; pass < 3; pass++) {
		/* populate string with one of each char: codes 1..255 go to a[0..254].
		 * The loop must run through 255; stopping at 254 leaves a[254]
		 * unwritten, so an indeterminate byte sits before the terminator. */
		for (i = 1; i < 256; i++)
			a[i - 1] = (char)i;
		a[255] = '\0';

		strip(a, masks[pass]);
		printf("%s\n", a);
	}

	return 0;
}</lang>output:<lang> !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ <odd stuff my xterm thinks are bad unicode hence can't be properly shown>

!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~</lang>

C++

<lang Cpp>#include <string>

#include <iostream>
#include <algorithm>
#include <boost/lambda/lambda.hpp>
#include <boost/lambda/casts.hpp>
#include <ctime>
#include <cstdlib>

using namespace boost::lambda ;

// Generator functor: each call yields one pseudo-random byte taken from rand( ).
struct MyRandomizer {
   char operator( )( ) {
      const int byteValue = rand( ) % 256 ;   // rand( ) is non-negative, so this is 0..255
      return static_cast<char>( byteValue ) ;
   }
} ;

// Returns a copy of startstring without ASCII control codes (0..31 and 127).
// Extended characters (128..255) are kept.
std::string deleteControls ( std::string startstring ) {
   std::string noControls ;   // grows as needed: no fixed-size buffer, no padding left over
   noControls.reserve( startstring.size( ) ) ;
   for ( std::string::const_iterator it = startstring.begin( ) ;
         it != startstring.end( ) ; ++it ) {
      // compare as unsigned: with a signed char the extended characters are
      // negative and would be mistaken for control codes
      const unsigned char code = static_cast<unsigned char>( *it ) ;
      // a control code is below 32 OR equal to 127 (never both at once)
      if ( code >= 32 && code != 127 )
         noControls.push_back( *it ) ;
   }
   return noControls ;
}

// Returns a copy of startstring reduced to printable ASCII (32..126):
// control codes (0..31, 127) and extended characters (128..255) are removed.
std::string deleteExtended( std::string startstring ) {
   std::string noExtended ;   // grows as needed: no fixed-size buffer, no padding left over
   noExtended.reserve( startstring.size( ) ) ;
   for ( std::string::const_iterator it = startstring.begin( ) ;
         it != startstring.end( ) ; ++it ) {
      // unsigned comparison keeps the test independent of char signedness
      const unsigned char code = static_cast<unsigned char>( *it ) ;
      // upper bound is 126: DEL (127) is a control code and must go as well
      if ( code >= 32 && code <= 126 )
         noExtended.push_back( *it ) ;
   }
   return noExtended ;
}

// Builds a 40-character string of random bytes and prints it unfiltered,
// then filtered by deleteControls and by deleteExtended.
int main( ) {

  std::string my_extended_string ;
  for ( int i = 0 ; i < 40 ; i++ ) //we want the extended string to be 40 characters long
     my_extended_string.append( " " ) ;
  // seed from the clock so every run produces a different sample
  srand( time( 0 ) ) ;
  // overwrite the 40 placeholder spaces with random bytes
  std::generate_n( my_extended_string.begin( ) , 40 , MyRandomizer( ) ) ;
  std::string no_controls( deleteControls( my_extended_string ) ) ;
  std::string no_extended ( deleteExtended( my_extended_string ) ) ;
  std::cout << "string with all characters: " << my_extended_string << std::endl ;
  std::cout << "string without control characters: " << no_controls << std::endl ;
  std::cout << "string without extended characters: " << no_extended << std::endl ;
  return 0 ;

}</lang> Output:

string with all characters: K�O:~���7�5����
���W��@>��ȓ�q�Q@���W-
string without control characters: K�O:~���7�5����
���W��@>��ȓ�q�Q@���W-
string without extended characters: KO:~75W@>qQ@W-    

Go

Go works for ASCII and non-ASCII systems. The first pair of functions below interpret strings as byte strings, presumably useful for strings consisting of ASCII and 8-bit extended ASCII data. The second pair of functions interpret strings as UTF-8. <lang go>package main

import (

   "fmt"
   "strings"

)

// two byte-oriented functions identical except for operator comparing c to 127.

// stripCtlFromBytes treats str as a byte string and returns a copy without
// the ASCII control codes (bytes 0-31 and 127). Bytes 128-255 are kept.
func stripCtlFromBytes(str string) string {
   b := make([]byte, len(str)) // the result can never be longer than the input
   var bl int                  // number of bytes kept so far
   for i := 0; i < len(str); i++ {
       c := str[i]
       if c >= 32 && c != 127 {
           b[bl] = c
           bl++
       }
   }
   return string(b[:bl])
}

// stripCtlAndExtFromBytes treats str as a byte string and returns only the
// bytes in the printable ASCII range 32-126.
func stripCtlAndExtFromBytes(str string) string {
   kept := make([]byte, 0, len(str))
   for _, c := range []byte(str) {
       if c < 32 || c >= 127 {
           continue // control code, DEL or extended byte
       }
       kept = append(kept, c)
   }
   return string(kept)
}

// two UTF-8 functions identical except for operator comparing c to 127

// stripCtlFromUTF8 decodes str as UTF-8 and drops the code points 0-31 and
// 127; every other code point is kept.
func stripCtlFromUTF8(str string) string {
   // strings.Map takes a func(rune) rune; a negative result drops the character
   return strings.Map(func(r rune) rune {
       if r >= 32 && r != 127 {
           return r
       }
       return -1
   }, str)
}

// stripCtlAndExtFromUTF8 decodes str as UTF-8 and keeps only the code points
// in the printable ASCII range 32-126.
func stripCtlAndExtFromUTF8(str string) string {
   // strings.Map takes a func(rune) rune; a negative result drops the character
   return strings.Map(func(r rune) rune {
       if r >= 32 && r < 127 {
           return r
       }
       return -1
   }, str)
}

const src = "déjà vu" + // precomposed unicode

   "\n\000\037 \041\176\177\200\377\n" +  // various boundary cases
   "as⃝df̅"                               // unicode combining characters

// main prints src, then the result of each of the four strip functions
// applied to it, each under a heading.
func main() {

   fmt.Println("source text:")
   // the extra "\n" operand is there to leave a blank line after each section
   fmt.Println(src, "\n")
   fmt.Println("as bytes, stripped of control codes:")
   fmt.Println(stripCtlFromBytes(src), "\n")
   fmt.Println("as bytes, stripped of control codes and extended characters:")
   fmt.Println(stripCtlAndExtFromBytes(src), "\n")
   fmt.Println("as UTF-8, stripped of control codes:")
   fmt.Println(stripCtlFromUTF8(src), "\n")
   fmt.Println("as UTF-8, stripped of control codes and extended characters:")
   fmt.Println(stripCtlAndExtFromUTF8(src))

} </lang> Output: (varies with display configuration)

source text:
déjà vu
� !~?��
as⃝df̅ 

as bytes, stripped of control codes:
déjà vu !~��as⃝df̅ 

as bytes, stripped of control codes and extended characters:
dj vu !~asdf 

as UTF-8, stripped of control codes:
déjà vu !~��as⃝df̅ 

as UTF-8, stripped of control codes and extended characters:
dj vu !~asdf

Icon and Unicon

We'll use deletec to remove unwanted characters (2nd argument) from a string (1st argument). The procedure below coerces types back and forth between string and cset. The character set of unwanted characters is the difference of all ASCII characters and the printable ASCII characters with codes 32 to 126 (the 33rd to 127th characters of &ascii, i.e. the slice [33:128]). <lang Icon>procedure main(A)
   # String slices are position based and stop before the end position:
   # [33:128] covers codes 32..126, so "~" (126) is kept and DEL (127) is not.
   write(image(deletec(&ascii,&ascii--(&ascii)[33:128])))
end

link strings </lang>

strings.icn provides deletec

The IPL procedure deletec is equivalent to this: <lang Icon>procedure deletec(s, c) #: delete characters

  result := ""
  s ? {
     # copy everything up to the next unwanted character, then skip the run of them
     while  result ||:= tab(upto(c)) do tab(many(c))
     # append whatever follows the last unwanted character
     return result ||:= tab(0)
     }

end</lang>


Output:

" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"

J

Solution: <lang j>stripControlCodes=: -.&(DEL,32{.a.)              NB. remove DEL and the first 32 characters of a.
stripControlExtCodes=: ([ -. -.)&(32}.127{.a.)   NB. keep only characters 32..126 of a.</lang> Usage: <lang j> mystring=: a. {~ ?~256 NB. ascii chars 0-255 in random order

  #mystring                       NB. length of string

256

  #stripControlCodes mystring     NB. length of string without control codes

223

  #stripControlExtCodes mystring  NB. length of string without control codes or extended chars

95

  #myunicodestring=: u: ?~1000     NB. unicode characters 0-999 in random order

1000

  #stripControlCodes myunicodestring

967

  #stripControlExtCodes myunicodestring

95

  stripControlExtCodes myunicodestring

k}w:]U3xEh9"GZdr/#^B.Sn%\uFOo[(`t2-J6*IA=Vf&N;lQ8,${XLz5?D0~s)'Y7Kq|ip4<WRCaM!b@cgv_T +mH>1ejPy</lang>

Liberty BASIC

<lang lb>

   ' Demo driver: build a string holding every byte value 0..255 once, then
   ' print it raw, with control codes removed, and with extended codes removed too.
   all$ =""
   for i =0 to 255
       all$ =all$ +chr$( i)
   next i
   print "Original string of bytes.  ( chr$( 10) causes a CRLF.)"
   print all$
   print
   lessControl$ =controlStripped$( all$)
   print "With control codes stripped out."
   print lessControl$
   print
   ' the second filter is applied to the already control-stripped string
   lessExtendedAndControl$ =extendedStripped$( lessControl$)
   print "With extended codes stripped out too."
   print lessExtendedAndControl$
   end
   ' Returns i$ without ASCII control codes (0..31 and 127).
   ' Extended codes (128..255) are kept.
   function controlStripped$( i$)
       r$ =""
       for j =1 to len( i$)
           ch$ =mid$( i$, j, 1)
           code =asc( ch$)
           ' DEL (127) is a control code as well, so it must go too
           if code >=32 and code <>127 then r$ =r$ +ch$
       next j
       controlStripped$ =r$
   end function
   ' Returns i$ without extended codes (128..255).
   ' Codes 0..127 are passed through unchanged.
   function extendedStripped$( i$)
       r$ =""
       for j =1 to len( i$)
           ch$ =mid$( i$, j, 1)
           ' 128 is the first extended code, so the comparison must exclude it
           if asc( ch$) <128 then r$ =r$ +ch$
       next j
       extendedStripped$ =r$
   end function

</lang>

Lua

<lang lua>function Strip_Control_Codes( str )
    -- Collect every run of non-control characters (%C) and glue the runs together.
    local kept = {}
    for run in str:gmatch( "%C+" ) do
        kept[#kept + 1] = run
    end
    return table.concat( kept )
end

function Strip_Control_and_Extended_Codes( str )
    -- Keep only the bytes in the printable ASCII range 32..126.
    local kept = {}
    for pos = 1, #str do
        local code = str:byte( pos )
        if code >= 32 and code <= 126 then
            kept[#kept + 1] = string.char( code )
        end
    end
    return table.concat( kept )
end

-- build a test string holding every byte value 0..255 once
q = "" for i = 0, 255 do q = q .. string.char(i) end

-- print q filtered by each of the two functions, one result per line
print( Strip_Control_Codes(q) ) print( Strip_Control_and_Extended_Codes(q) )</lang>

 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~

OCaml

<lang ocaml>let is_control_code c =
  (* true for the codes 0..31 and for DEL (127) *)
  match c with
  | '\000' .. '\031' | '\127' -> true
  | _ -> false

let is_extended_char c =
  (* true for every code above 127 *)
  Char.code c >= 128

(* [strip f str] returns a copy of [str] without the characters for which
   [f] is true; the order of the remaining characters is unchanged.
   Built with Buffer, so it does not depend on mutable strings. *)
let strip f str =
  let buf = Buffer.create (String.length str) in
  String.iter (fun c -> if not (f c) then Buffer.add_char buf c) str;
  Buffer.contents buf

(* Demo: build a string of 32 random bytes and print it with control codes
   stripped, then with control codes and extended characters stripped. *)
let () =
  let len = 32 in
  Random.self_init();
  (* String.init builds the string directly, so no in-place mutation is needed *)
  let s = String.init len (fun _ -> char_of_int (Random.int 256)) in
  print_endline (strip is_control_code s);
  print_endline (strip (fun c -> (is_control_code c) || (is_extended_char c)) s);
</lang>

Perl

<lang Perl>#!/usr/bin/perl -w
use strict ;

# Draws 40 random byte values and prints them unfiltered, without control
# codes, and without control codes and extended characters.
my @letters ;
my @nocontrols ;
my @noextended ;
for ( 1..40 ) {
   push @letters ,  int( rand( 256 ) ) ;
}
print "before sanitation : " ;
print join( '' , map { chr( $_ ) } @letters ) ;
print "\n" ;
# control codes are 0..31 and 127; 32 (the space) is printable and is kept
@nocontrols = grep { $_ >= 32 && $_ != 127 } @letters ;
print "Without controls: " ;
print join( '' , map { chr( $_ ) } @nocontrols ) ;
# 127 is already gone, so "< 127" removes exactly the extended values 128..255
@noextended = grep { $_ < 127 } @nocontrols ;
print "\nWithout extended: " ;
print join( '' , map { chr( $_ ) } @noextended ) ;
print "\n" ;</lang> Output:

before sanitation : �L08&YH�O��n)�:���O�G$���.���"zO���Q�?��
Without controls: �L08&YH�O��n)�:�O�G$���.���"zO��Q�?��
Without extended: L08&YHOn):OG$."zOQ?

Perl 6

<lang perl6>my $str = (0..400).roll(80)».chr.join;   # 80 random characters with codes 0..400

say $str;
# control codes (0..31 and DEL) removed, written as explicit escapes
say $str.subst(/<[ \x00..\x1F \x7F ]>/, '', :g);
# everything outside printable ASCII (space..~) removed
say $str.subst(/<-[ \ ..~ ]>/, '', :g);</lang>

�¶ØèúđkƌĘ�r­=êıƏÄÙÍy1SGa%TÑ�ęMŒRŅ�EŧİÌŬ–ńĩµ9ŒďĔÜÉĈĬz‰ijdś5FúŨƏźƅíýÛÃņGÏ
                                                                      ö~šƀ‹RÑú›
¶ØèúđkƌĘr­=êıƏÄÙÍy1SGa%TÑęMŒRŅEŧİÌŬ–ńĩµ9ŒďĔÜÉĈĬz‰ijdś5FúŨƏźƅíýÛÃņGÏö~šƀ‹RÑú›
kr=y1SGa%TMRE9zd5FG~R

PicoLisp

Control characters in strings are written with a hat (^) in PicoLisp. ^? is the DEL character. <lang PicoLisp>(de stripCtrl (Str)
   # Returns Str without control characters: DEL and everything below the space.
   # A chained (> " " C "^A") is strict at both ends and would let "^A" through,
   # so only the upper bound is tested.
   (pack
      (filter
         '((C)
            (nor (= "^?" C) (> " " C)) )
         (chop Str) ) ) )

(de stripCtrlExt (Str)
   # Keeps only the characters strictly between "^_" (31) and "^?" (127).
   (let Chars (chop Str)
      (pack
         (filter
            '((C) (and (> C "^_") (> "^?" C)))
            Chars ) ) ) )</lang>

Test:

: (char "^?")
-> 127

: (char "^_")
-> 31

: (stripCtrl "^I^M a b c^? d äöüß")
-> " a b c d äöüß"

: (stripCtrlExt "^I^M a b c^? d äöüß")
-> " a b c d "

PureBasic

<lang PureBasic>Procedure.s stripControlCodes(source.s)
  ; Returns source without ASCII control codes (values below 32, and 127).
  ; Extended characters are kept.
  Protected i, *ptrChar.Character, length = Len(source), result.s
  *ptrChar = @source
  For i = 1 To length
    ; DEL (127) is a control code too, so it is excluded as well
    If *ptrChar\c > 31 And *ptrChar\c <> 127
      result + Chr(*ptrChar\c)
    EndIf
    *ptrChar + SizeOf(Character)
  Next
  ProcedureReturn result

EndProcedure

Procedure.s stripControlExtCodes(source.s)
  ; Returns source reduced to printable ASCII (values 32 to 126).
  Protected i, *ptrChar.Character, length = Len(source), result.s
  *ptrChar = @source
  For i = 1 To length
    ; upper bound is 127 exclusive: DEL is a control code, not a printable one
    If *ptrChar\c > 31 And *ptrChar\c < 127
      result + Chr(*ptrChar\c)
    EndIf
    *ptrChar + SizeOf(Character)
  Next
  ProcedureReturn result

EndProcedure

; Demo: print an 80-character random string filtered by each of the two procedures.
If OpenConsole()

 ;create sample string
 Define i, s.s
 For i = 1 To 80
   s + Chr(Random(254) + 1) ;include character values from 1 to 255
 Next 
 PrintN(stripControlCodes(s))    ;string without control codes 
 PrintN("---------")
 PrintN(stripControlExtCodes(s)) ;string without control codes or extended chars
 
 ;wait for ENTER before closing the console
 Print(#CRLF$ + #CRLF$ + "Press ENTER to exit"): Input()
 CloseConsole()

EndIf</lang> Sample output:

»╫=┐C─≡G(═ç╤â√╝÷╔¬ÿ▌x  è4∞|)ï└⌐ƒ9²òτ┌ºáj)▓<~-vPÿφQ╨ù¿╖îFh"[ü╗dÉ₧q#óé├p╫■
---------
=CG(x 4|)9j)<~-vPQFh"[dq#p

Python

<lang Python>def stripped(x): return "".join([i for i in x if ord(i) in range(32, 127)])

print stripped("\ba\x00b\n\rc\fd\xc3")</lang>Output:<lang>abcd</lang>

Seed7

Seed7 strings are UTF-32 encoded, therefore no distinction between BYTE and Unicode strings is necessary. The example below uses STD_UTF8_OUT from the library utf8.s7i to write Unicode characters with UTF-8 encoding to the console.

<lang seed7>$ include "seed7_05.s7i";

 include "utf8.s7i";

# Returns stri without control characters (below ' ' and '\127\').
const func string: stripControl (in string: stri) is func

 result
   var string: stripped is "";
 local
   var char: ch is ' ';
 begin
   # append the characters to keep one at a time
   for ch range stri do
     if ch >= ' ' and ch <> '\127\' then
       stripped &:= ch;
     end if;
   end for;
 end func;

# Returns stri reduced to the characters from ' ' up to, but excluding, '\127\'.
const func string: stripControlAndExtended (in string: stri) is func

 result
   var string: stripped is "";
 local
   var char: ch is ' ';
 begin
   # append the characters to keep one at a time
   for ch range stri do
     if ch >= ' ' and ch < '\127\' then
       stripped &:= ch;
     end if;
   end for;
 end func;

const string: src is "déjà vu\ # Unicode

   \\n\0\\31\ \33\\126\\127\\128\\255\\n\  # Various boundary cases
   \as⃝df̅";                                 # Unicode combining characters

# Writes src, then src filtered by stripControl and by stripControlAndExtended.
const proc: main is func

 begin
   # route standard output through STD_UTF8_OUT
   OUT := STD_UTF8_OUT;
   writeln("source text:");
   writeln(src);
   writeln("Stripped of control codes:");
   writeln(stripControl(src));
   writeln("Stripped of control codes and extended characters:");
   writeln(stripControlAndExtended(src));
 end func;</lang>

Output:

source text:
déjà vu
� !~?€ÿ
as⃝df̅
Stripped of control codes:
déjà vu !~€ÿas⃝df̅
Stripped of control codes and extended characters:
dj vu !~asdf

Tcl

<lang tcl>proc stripAsciiCC str {
    # Delete every run of control characters (U+0000-U+001F and U+007F).
    return [regsub -all {[\u0000-\u001f\u007f]+} $str ""]
}
proc stripCC str {
    # Delete every run of characters outside printable ASCII (U+0020-U+007E).
    return [regsub -all {[^\u0020-\u007e]+} $str ""]
}</lang>