Strip block comments: Difference between revisions

Content added Content deleted

Inline

Revision as of 02:46, 13 April 2011

A block comment begins with a beginning delimiter and ends with an ending delimiter; the comment includes both delimiters. These delimiters are often multi-character sequences.

Task: Strip block comments from program text (of a programming language much like classic C). Your demos should at least handle simple, non-nested and multiline block comment delimiters. The beginning delimiter is the two-character sequence “/*” and the ending delimiter is “*/”.

Sample text for stripping:

  /**
   * Some comments
   * longer comments here that we can parse.
   *
   * Rahoo 
   */
   function subroutine() {
    a = /* inline comment */ b + c ;
   }
   /*/ <-- tricky comments */

   /**
    * Another comment.
    */
    function something() {
    }

Extra credit: Ensure that the stripping code is not hard-coded to the particular delimiters described above, but instead allows the caller to specify them. (If your language supports them, optional parameters may be useful for this.)

Cf. Strip comments from a string

C

<lang C>#include<stdio.h>

#include<string.h>
#include<stdlib.h>

char*ca="/*",*cb="*/"; int al=2,bl=2;

/* Print a prompt and wait until one character (normally Enter) arrives on stdin. */
void pause(){

puts("Press Enter");
(void)getchar(); /* the character itself is of no interest */

}

/*
 * Read the whole file named by s into a freshly malloc'ed, NUL-terminated
 * buffer.
 *
 * Returns:
 *   - the new buffer when the file was opened and memory was available
 *     (the caller owns it and should free() it);
 *   - NULL when the file size could not be determined or malloc failed;
 *   - s itself, i.e. the file NAME, when the file cannot be opened. This
 *     fallback is kept from the original so existing callers see no change;
 *     compare the result with the argument to detect it.
 */
char*loadfile(char*s){

FILE*f;
long l;
size_t n;
if((f=fopen(s,"rb"))){
 s=NULL;
 if(!fseek(f,0,SEEK_END)&&(l=ftell(f))>=0){
  rewind(f);
  s=(char*)malloc((size_t)l+1);      /* +1 for the terminator */
  if(s){
   n=fread(s,1,(size_t)l,f);
   s[n]='\0';                        /* terminate after the bytes actually read */
  }
 }
 fclose(f);
}
return s;

}

void stripcomments(char*s){

char*a,*b;
int l=strlen(s)+1;
while(a=strstr(s,ca)){
 b=strstr(a+al,cb);
 if(!b)break;
 b+=bl;
 memmove(a,b,l-(b-a));
}

}

/*
 * Entry point.
 *   argv[1]          input file (default "input.txt")
 *   argv[2], argv[3] opening and closing comment delimiters; both must be
 *                    given (argc==4) to replace the defaults held in ca/cb
 * Prints the stripped text, waits for Enter and returns 0; returns 1 when
 * the file could not be loaded.
 */
int main(int argc,char**argv){

char*path="input.txt",*s;
if(argc>=2)path=argv[1];
s=loadfile(path);
/* loadfile hands back its own argument when the file cannot be opened and
   NULL when no buffer is available; neither is text we may edit or free. */
if(!s||s==path){
 fprintf(stderr,"Cannot read %s\n",path);
 return 1;
}
if(argc==4){
 al=strlen(ca=argv[2]);
 bl=strlen(cb=argv[3]);
}
stripcomments(s);
puts(s);
free(s);
pause();
return 0;

} </lang> Usage: Specify an input file via the first command line argument, and optionally specify comment opening and closing delimiters with the next two args, or defaults of /* and */ are assumed.

Output:


   function subroutine() {
    a =  b + c ;
   }



    function something() {
    }

Press Enter

C++

<lang C++>#include <string>

#include <iostream>
#include <iterator>
#include <fstream>
#include <boost/regex.hpp>

int main( ) {

   std::ifstream codeFile( "samplecode.txt" ) ;
   if ( codeFile ) {
      boost::regex commentre( "/\\*.*?\\*/" ) ;//comment start and end, and as few characters in between as possible
      std::string my_erase( "" ) ;             //erase them
      std::string stripped ;
      std::string code( (std::istreambuf_iterator<char>( codeFile ) ) ,

std::istreambuf_iterator<char>( ) ) ;

      codeFile.close( ) ;
      stripped = boost::regex_replace( code , commentre , my_erase ) ;
      std::cout << "Code unstripped:\n" << stripped << std::endl ;
      return 0 ;
   }
   else {
      std::cout << "Could not find code file!" << std::endl ;
      return 1 ;
   }

}</lang> Output:

Code unstripped:

   function subroutine() {
    a =  b + c ;
   }
   

   
    function something() {
    }

Clojure

<lang Clojure>(defn comment-strip [txt & args]
 ;; Returns txt with delimited comments removed.
 ;; Optional keyword argument :delim [opener closer]; defaults to ["/*" "*/"].
 ;; delim-count goes up on each opener and down on each closer; characters
 ;; are appended to out only while it is 0.

 (let [args (conj {:delim ["/*" "*/"]} (apply hash-map args)) ; This is the standard way of doing keyword/optional arguments in Clojure

[opener closer] (:delim args)]

   (loop [out "", txt txt, delim-count 0] ; delim-count is needed to handle nested comments
     (let [[hdtxt resttxt] (split-at (count opener) txt)] ; This splits "/* blah blah */" into hdtxt="/*" and restxt="blah blah */"

;; The printf below writes one trace line per step; it is debug output and
;; is what produces the hdtxt=/resttxt= lines in the recorded session.
;; NOTE(review): hdtxt always has (count opener) characters, so the
;; comparison with closer presumably only works when both delimiters have
;; the same length -- confirm before passing other delimiters.
(printf "hdtxt=%8s resttxt=%8s out=%8s txt=%16s delim-count=%s\n" (apply str hdtxt) (apply str resttxt) out (apply str txt) delim-count) (cond (empty? hdtxt) (str out (apply str txt)) (= (apply str hdtxt) opener) (recur out resttxt (inc delim-count)) (= (apply str hdtxt) closer) (recur out resttxt (dec delim-count)) (= delim-count 0)(recur (str out (first txt)) (rest txt) delim-count) true (recur out (rest txt) delim-count))))))</lang>

user> (comment-strip "This /* is */ some /* /* /* */ funny */ */ text")
hdtxt=      Th resttxt=is /* is */ some /* /* /* */ funny */ */ text out=         txt=This /* is */ some /* /* /* */ funny */ */ text delim-count=0
hdtxt=      hi resttxt=s /* is */ some /* /* /* */ funny */ */ text out=       T txt=his /* is */ some /* /* /* */ funny */ */ text delim-count=0
hdtxt=      is resttxt= /* is */ some /* /* /* */ funny */ */ text out=      Th txt=is /* is */ some /* /* /* */ funny */ */ text delim-count=0
hdtxt=      s  resttxt=/* is */ some /* /* /* */ funny */ */ text out=     Thi txt=s /* is */ some /* /* /* */ funny */ */ text delim-count=0
hdtxt=       / resttxt=* is */ some /* /* /* */ funny */ */ text out=    This txt= /* is */ some /* /* /* */ funny */ */ text delim-count=0
hdtxt=      /* resttxt= is */ some /* /* /* */ funny */ */ text out=   This  txt=/* is */ some /* /* /* */ funny */ */ text delim-count=0
hdtxt=       i resttxt=s */ some /* /* /* */ funny */ */ text out=   This  txt= is */ some /* /* /* */ funny */ */ text delim-count=1
hdtxt=      is resttxt= */ some /* /* /* */ funny */ */ text out=   This  txt=is */ some /* /* /* */ funny */ */ text delim-count=1
hdtxt=      s  resttxt=*/ some /* /* /* */ funny */ */ text out=   This  txt=s */ some /* /* /* */ funny */ */ text delim-count=1
hdtxt=       * resttxt=/ some /* /* /* */ funny */ */ text out=   This  txt= */ some /* /* /* */ funny */ */ text delim-count=1
hdtxt=      */ resttxt= some /* /* /* */ funny */ */ text out=   This  txt=*/ some /* /* /* */ funny */ */ text delim-count=1
hdtxt=       s resttxt=ome /* /* /* */ funny */ */ text out=   This  txt= some /* /* /* */ funny */ */ text delim-count=0
hdtxt=      so resttxt=me /* /* /* */ funny */ */ text out=  This   txt=some /* /* /* */ funny */ */ text delim-count=0
hdtxt=      om resttxt=e /* /* /* */ funny */ */ text out= This  s txt=ome /* /* /* */ funny */ */ text delim-count=0
hdtxt=      me resttxt= /* /* /* */ funny */ */ text out=This  so txt=me /* /* /* */ funny */ */ text delim-count=0
hdtxt=      e  resttxt=/* /* /* */ funny */ */ text out=This  som txt=e /* /* /* */ funny */ */ text delim-count=0
hdtxt=       / resttxt=* /* /* */ funny */ */ text out=This  some txt= /* /* /* */ funny */ */ text delim-count=0
hdtxt=      /* resttxt= /* /* */ funny */ */ text out=This  some  txt=/* /* /* */ funny */ */ text delim-count=0
hdtxt=       / resttxt=* /* */ funny */ */ text out=This  some  txt= /* /* */ funny */ */ text delim-count=1
hdtxt=      /* resttxt= /* */ funny */ */ text out=This  some  txt=/* /* */ funny */ */ text delim-count=1
hdtxt=       / resttxt=* */ funny */ */ text out=This  some  txt= /* */ funny */ */ text delim-count=2
hdtxt=      /* resttxt= */ funny */ */ text out=This  some  txt=/* */ funny */ */ text delim-count=2
hdtxt=       * resttxt=/ funny */ */ text out=This  some  txt= */ funny */ */ text delim-count=3
hdtxt=      */ resttxt= funny */ */ text out=This  some  txt=*/ funny */ */ text delim-count=3
hdtxt=       f resttxt=unny */ */ text out=This  some  txt= funny */ */ text delim-count=2
hdtxt=      fu resttxt=nny */ */ text out=This  some  txt=funny */ */ text delim-count=2
hdtxt=      un resttxt=ny */ */ text out=This  some  txt= unny */ */ text delim-count=2
hdtxt=      nn resttxt=y */ */ text out=This  some  txt=  nny */ */ text delim-count=2
hdtxt=      ny resttxt= */ */ text out=This  some  txt=   ny */ */ text delim-count=2
hdtxt=      y  resttxt=*/ */ text out=This  some  txt=    y */ */ text delim-count=2
hdtxt=       * resttxt=/ */ text out=This  some  txt=      */ */ text delim-count=2
hdtxt=      */ resttxt= */ text out=This  some  txt=      */ */ text delim-count=2
hdtxt=       * resttxt=  / text out=This  some  txt=         */ text delim-count=1
hdtxt=      */ resttxt=    text out=This  some  txt=         */ text delim-count=1
hdtxt=       t resttxt=     ext out=This  some  txt=            text delim-count=0
hdtxt=      te resttxt=      xt out=This  some   txt=            text delim-count=0
hdtxt=      ex resttxt=       t out=This  some  t txt=             ext delim-count=0
hdtxt=      xt resttxt=         out=This  some  te txt=              xt delim-count=0
hdtxt=       t resttxt=         out=This  some  tex txt=               t delim-count=0
hdtxt=         resttxt=         out=This  some  text txt=                 delim-count=0
"This  some  text"

D

<lang d>import std.stdio ; import std.regexp, std.algorithm ;

// Separates s into non-comment text and comment text.
//
// cpat must hold exactly two regular-expression patterns: cpat[0] matches a
// comment opener, cpat[1] a comment closer.
// Returns a two-element array: res[0] collects the text outside comments,
// res[1] the comments themselves, delimiters included.
// Throws Exception("Mismatched Comment") when the search stops while a
// closer is still being looked for.
string[] sepComment(string s, string cpat[] ...) {

   assert(cpat.length == 2,
       "sepComment : 2 pattern arguments for comment begin & end") ;
   string[] res = new string[](2) ;
   int p = 0, q = 0 /* cursors */, ic = 0 ;     // inside comment?
   int[] plen = new int[](2) ;                  // this's for handling /*/

   // Searches s from p for cpat[ic]. On a match, q is set to the start of
   // the match when an opener is wanted (ic == 0) or to its end when a
   // closer is wanted (ic == 1), and true is returned. plen[ic] remembers
   // the longest match seen so far for that pattern.
   bool advCursor() {
       auto m = std.regexp.search(s[p..$], cpat[ic]) ;
       if(m is null) return false ;
       plen[ic] = max(0, plen[ic], m[0].length) ;
       q = p + m.pre.length ;                   // got comment head
       if(ic) { q += m[0].length  ;    }        // or comment tail
       if(std.regexp.find(m[0], "\n|\r") != -1) // special adjust for \n\r
           q-- ;
       return true ;
   }

   // Alternate between "find opener" and "find closer"; s[p..q] is appended
   // to res[0] or res[1] depending on which of the two is being searched.
   while(true) {
       if(!advCursor()) break ;
       res[ic] ~= s[p..q] ;                     // save slice of result
       // A comment shorter than both delimiters together means the closer
       // overlapped the opener, so search once more for the real closer.
       if( ic && (q - p < plen[0] + plen[1])) { // this handle /*/ pattern
           p = q ;
           if(!advCursor()) break ;
           res[ic] ~= s[p..q] ;                 // save result again
       }
       p = q ;                                  // advance cursor
       ic = 1 - ic ;                            // toggle search type
   }
   if(ic)
       throw new Exception("Mismatched Comment") ;
   res[ic] ~= s[p..$] ;                         // save rest(non-comment)
   return res ;

}

// Demonstrates sepComment twice: first on the task's sample text with block
// comment patterns, then on a short text with line-comment patterns.
void main() {

   // sample program text from the task description
   string s = `  /**
  * Some comments
  * longer comments here that we can parse.
  *
  * Rahoo
  */
  function subroutine() {
   a = /* inline comment */ b + c ;
  }
  /*/ <-- tricky comments */

  /**
   * Another comment.
   */
   function something() {
   }` ;

   writefln("==original:\n%s", s) ;
   // the delimiters are regular expressions, hence the escaped '*'
   auto t = sepComment(s, `/\*`, `\*/`) ;
   writefln("==comment stripped:\n%s\n==stripped comment:\n%s", t[0], t[1]) ;

   s = "apples, pears # and bananas

apples, pears ; and bananas " ; // test for line comment

   writefln("==original:\n%s", s) ;
   // opener: '#' or ';'  --  closer: a line break or the end of the text
   t = sepComment(s, `#|;`, `[\n\r]|$`) ;
   writefln("==comment stripped:\n%s\n==stripped comment:\n%s", t[0], t[1]) ;

}</lang> part of output:

==comment stripped:

   function subroutine() {
    a =  b + c ;
   }



    function something() {
    }
==stripped comment:
/**
   * Some comments
   * longer comments here that we can parse.
   *
   * Rahoo
   *//* inline comment *//*/ <-- tricky comments *//**
    * Another comment.
    */

Go

For the extra credit: No optional parameters in Go, but documented below is an efficient technique for letting the caller specify the delimiters. <lang go>package main

import (

   "fmt"
   "strings"

)

// idiomatic to name a function newX that allocates an object, initializes it,
// and returns it ready to use.  the object in this case is a closure.
//
// newStripper returns a function that removes every comment delimited by
// start and end from its argument.  If either delimiter is empty, both
// default to the C-style pair "/*" and "*/".  A start delimiter that is not
// followed by an end delimiter is left in place.
func newStripper(start, end string) func(string) string {

   // default to c-style block comments
   if start == "" || end == "" {
       start, end = "/*", "*/"
   }
   // closes on variables start, end.
   return func(source string) string {
       for {
           cs := strings.Index(source, start)
           if cs < 0 {
               break
           }
           // search for the end only after the complete start delimiter,
           // whatever its length
           body := cs + len(start)
           ce := strings.Index(source[body:], end)
           if ce < 0 {
               break
           }
           source = source[:cs] + source[body+ce+len(end):]
       }
       return source
   }

}

func main() {

   // idiomatic is that zero values indicate to use meaningful defaults
   stripC := newStripper("", "")

   // strip function now defined and can be called any number of times
   // without respecifying delimiters
   fmt.Println(stripC(`  /**
  * Some comments
  * longer comments here that we can parse.
  *
  * Rahoo
  */
  function subroutine() {
   a = /* inline comment */ b + c ;
  }
  /*/ <-- tricky comments */

  /**
   * Another comment.
   */
   function something() {
   }`))

}</lang>

J

<lang j>strip=:#~1 0 _1*./@:(|."0 1)2>4{"1(5;(0,"0~".;._2]0 :0);'/*'i.a.)&;:

)</lang>

Example data:

<lang j>example=: 0 :0

 /**
  * Some comments
  * longer comments here that we can parse.
  *
  * Rahoo 
  */
  function subroutine() {
   a = /* inline comment */ b + c ;
  }
  /*/ <-- tricky comments */

  /**
   * Another comment.
   */
   function something() {
   }

)</lang>

Example use:

<lang j> strip example

  function subroutine() {
   a =  b + c ;
  }

   function something() {
   }</lang>

Here is a version which allows the delimiters to be passed as an optional left argument as a pair of strings:

<lang j>stripp=:3 :0

 NB. monadic case: no delimiters given, use the C-style pair
 ('/*';'*/') stripp y

:

 NB. dyadic case: x is a pair of boxed strings, the opening and closing delimiter
 'open close'=. x
 marks=. (+./(-i._1+#open,close)|."0 1 open E. y) - close E.&.|. y
 y #~  -. (+._1&|.) (1 <. 0 >. +)/\.&.|. marks

)</lang>

Lua

It is assumed that the code is in the file "Text1.txt". <lang lua>filename = "Text1.txt"

-- Read the whole file; assert turns a failed open into a clear error
-- instead of an "attempt to index a nil value" on the next line.
fp = assert( io.open( filename, "r" ) )
str = fp:read( "*all" )
fp:close()

-- "/%*" and "%*/" are the escaped delimiters, ".-" matches as few
-- characters as possible between them.
stripped = string.gsub( str, "/%*.-%*/", "" )
print( stripped )</lang>

Perl

<lang Perl>#!/usr/bin/perl -w
use strict ;
use warnings ;

# Read samplecode.txt in one go, delete every /* ... */ block and print the rest.
open( FH , "<" , "samplecode.txt" ) or die "Can't open file!$!\n" ;
my $code = "" ;
{
   local $/ ;      # no record separator inside this block
   $code = <FH> ; #slurp mode
}
close FH ;
# /s lets . match newlines, /g removes all comments, .*? stops at the first closer
$code =~ s,/\*.*?\*/,,sg ;
print $code . "\n" ;</lang> Output:

function subroutine() {
    a =  b + c ;
   }
   

   
    function something() {
    }

Perl 6

<lang perl6>sample().split(/ '/*' .*? '*/' /).print; # .*? (not .+?) so that an empty comment is removed as well

sub sample { ' /**

   * Some comments
   * longer comments here that we can parse.
   *
   * Rahoo
   */
   function subroutine() {
    a = /* inline comment */ b + c ;
   }
   /*/ <-- tricky comments */

   /**
    * Another comment.
    */
   function something() {
   }

'}</lang>

Output:

   
    function subroutine() {
     a =  b + c ;
    }
    

    
    function something() {
    }

PicoLisp

<lang PicoLisp>(in "sample.txt"

  # Reads sample.txt and alternates between two copies: up to "/*" to the
  # current output, then up to "*/" to /dev/null, i.e. discarded.
  # NOTE(review): presumably 'echo' stops at the given pattern without
  # copying it and returns NIL at end of input, which ends the loop --
  # confirm against the PicoLisp reference.
  (while (echo "/*")
     (out "/dev/null" (echo "*/")) ) )</lang>

Output:


   function subroutine() {
    a =  b + c ;
   }
   

   
    function something() {
    }

PL/I

<lang PL/I> /* A program to remove comments from text. */ strip: procedure options (main); /* 8/1/2011 */

  /* text holds the current input line (at most 80 characters);        */
  /* k and j receive the positions of a comment opener and closer.     */
  declare text character (80) varying;
  declare (j, k) fixed binary;

  /* Reaching the end of SYSIN ends the program. */
  on endfile (sysin) stop;

  do forever;
     get edit (text) (L);
     /* Keep going until no comment opener is left in text. */
     do until (k = 0);
        k = index(text, '/*');
        if k > 0 then /* we have a start of comment. */
           do;
              /* Look for end of comment. */
              j = index(text, '*/', k+2);
              if j > 0 then
                 do;
                    /* Comment closed on the same line: join the text before and after it. */
                    text = substr(text, 1, k-1) ||
                           substr(text, j+2, length(text)-(j+2)+1);
                 end;
              else
                 do; /* The comment continues onto the next line. */
                    put skip list ( substr(text, 1, k-1) );

/* Read further lines until one holds the closer; put skip is executed for each line without it. */
more: get edit (text) (L);

                    j = index(text, '*/');
                    if j = 0 then do; put skip; go to more; end;
                    /* Keep only what follows the closer. */
                    text = substr(text, j+2, length(text) - (j+2) + 1);
                 end;
           end;
     end;
     put skip list (text);
  end;

end strip; </lang>

PureBasic

Solution using regular expressions. A stripBlocks() procedure is defined that will strip comments between any two delimiters. <lang PureBasic>Procedure.s escapeChars(text.s)

 Static specialChars.s = "[\^$.|?*+()"
 Protected output.s, nextChar.s, i, countChar = Len(text)
 For i = 1 To countChar
   nextChar = Mid(text, i, 1)
   If FindString(specialChars, nextChar, 1)
     output + "\" + nextChar
   Else
     output + nextChar
   EndIf 
 Next
 ProcedureReturn output

EndProcedure

; Returns text with every span that starts with first and ends with last
; removed, delimiters included. Both delimiters are passed through
; escapeChars() before being placed in the pattern; ".*?" between them takes
; the shortest possible span.
Procedure.s stripBlocks(text.s, first.s, last.s)

 Protected pattern.s = escapeChars(first) + ".*?" + escapeChars(last)
 Protected regex = CreateRegularExpression(#PB_Any, pattern, #PB_RegularExpression_DotAll)
 Protected result.s = ReplaceRegularExpression(regex, text, "")
 FreeRegularExpression(regex) ; release the expression once the replacement is done
 ProcedureReturn result

EndProcedure

; Build the sample text one line at a time; "source.s + ..." appends to it.
Define source.s
source.s = " /**" + #CRLF$
source.s + " * Some comments" + #CRLF$
source.s + " * longer comments here that we can parse." + #CRLF$
source.s + " *" + #CRLF$
source.s + " * Rahoo " + #CRLF$
source.s + " */" + #CRLF$
source.s + " function subroutine() {" + #CRLF$
source.s + " a = /* inline comment */ b + c ;" + #CRLF$
source.s + " }" + #CRLF$
source.s + " /*/ <-- tricky comments */" + #CRLF$
source.s + "" + #CRLF$
source.s + " /**" + #CRLF$
source.s + " * Another comment." + #CRLF$
source.s + " */" + #CRLF$
source.s + " function something() {" + #CRLF$
source.s + " }" + #CRLF$

; Show the source, then the result for two different delimiter pairs.
If OpenConsole()

 PrintN("--- source ---")
 PrintN(source)
 PrintN("--- source with block comments between '/*' and '*/' removed ---")
 PrintN(stripBlocks(source, "/*", "*/"))
 PrintN("--- source with block comments between '*' and '*' removed ---")
 PrintN(stripBlocks(source, "*", "*"))
  
 Print(#CRLF$ + #CRLF$ + "Press ENTER to exit"): Input()
 CloseConsole()

EndIf</lang> Sample output:

--- source ---
  /**
   * Some comments
   * longer comments here that we can parse.
   *
   * Rahoo
   */
   function subroutine() {
    a = /* inline comment */ b + c ;
   }
   /*/ <-- tricky comments */

   /**
    * Another comment.
    */
    function something() {
    }

--- source with block comments between '/*' and '*/' removed ---

   function subroutine() {
    a =  b + c ;
   }



    function something() {
    }

--- source with block comments between '*' and '*' removed ---
  /
    longer comments here that we can parse.
    Rahoo
    inline comment / <-- tricky comments  Another comment.
    */
    function something() {
    }

Python

The code takes the comment delimiters as an argument and will also strip nested block comments.

<lang python>def _commentstripper(txt, delim):

   'Strips first nest of block comments'
   
   deliml, delimr = delim
   out = 
   if deliml in txt:
       indx = txt.index(deliml)
       out += txt[:indx]
       txt = txt[indx+len(deliml):]
       txt = _commentstripper(txt, delim)
       assert delimr in txt, 'Cannot find closing comment delimiter in ' + txt
       indx = txt.index(delimr)
       out += txt[(indx+len(delimr)):]
   else:
       out = txt
   return out

def commentstripper(txt, delim=('/*', '*/')):

   'Strips nests of block comments'
   
   deliml, delimr = delim
   while deliml in txt:
       txt = _commentstripper(txt, delim)
   return txt</lang>

Tests and sample output <lang python>def test():

   print('\nNON-NESTED BLOCK COMMENT EXAMPLE:')
   sample =   /**
  * Some comments
  * longer comments here that we can parse.
  *
  * Rahoo 
  */
  function subroutine() {
   a = /* inline comment */ b + c ;
  }
  /*/ <-- tricky comments */

  /**
   * Another comment.
   */
   function something() {
   }
   print(commentstripper(sample))

   print('\nNESTED BLOCK COMMENT EXAMPLE:')
   sample =   /**
  * Some comments
  * longer comments here that we can parse.
  *
  * Rahoo 
  *//*
  function subroutine() {
   a = /* inline comment */ b + c ;
  }
  /*/ <-- tricky comments */
  */
  /**
   * Another comment.
   */
   function something() {
   }
   print(commentstripper(sample))

if __name__ == '__main__':

   test()</lang>

NON-NESTED BLOCK COMMENT EXAMPLE:
  
   function subroutine() {
    a =  b + c ;
   }
   

   
    function something() {
    }

NESTED BLOCK COMMENT EXAMPLE:
  
   
    function something() {
    }

Ruby

<lang ruby>def remove_comments!(str, comment_start='/*', comment_end='*/')
 # Deletes every comment from str in place and returns str.
 # A comment runs from comment_start through the next comment_end found
 # after it. A comment_start without a following comment_end is left as is.

 while start_idx = str.index(comment_start) 
   # search past the opening delimiter so it cannot double as the closing one
   close_idx = str.index(comment_end, start_idx + comment_start.length)
   break if close_idx.nil?   # unterminated comment: nothing more to remove
   str[start_idx ... close_idx + comment_end.length] = "" 
 end
 str

end

# Non-destructive variant: works on a duplicate of str and returns it with
# the comments removed; str itself is not modified.
def remove_comments(str, comment_start='/*', comment_end='*/')

 copy = str.dup
 remove_comments!(copy, comment_start, comment_end)

end

example = <<END_OF_STRING

 /**
  * Some comments
  * longer comments here that we can parse.
  *
  * Rahoo 
  */
  function subroutine() {
   a = /* inline comment */ b + c ;
  }
  /*/ <-- tricky comments */

  /**
   * Another comment.
   */
   function something() {
   }

END_OF_STRING

puts remove_comments example</lang>

outputs

  
   function subroutine() {
    a =  b + c ;
   }
   

   
    function something() {
    }

Tcl

<lang tcl>proc stripBlockComment {string {openDelimiter "/*"} {closeDelimiter "*/"}} {

   # Returns $string with every block running from openDelimiter through
   # closeDelimiter removed.

   # Backslash each non-word character so the delimiters are matched literally
   set escapedOpen  [regsub -all {\W} $openDelimiter {\\&}]
   set escapedClose [regsub -all {\W} $closeDelimiter {\\&}]

   # ".*?" is non-greedy, so each match ends at the first closing delimiter
   set pattern [format {%s.*?%s} $escapedOpen $escapedClose]
   return [regsub -all $pattern $string ""]

}</lang> Demonstration code: <lang tcl>puts [stripBlockComment " /**

  * Some comments
  * longer comments here that we can parse.
  *
  * Rahoo 
  */
  function subroutine() {
   a = /* inline comment */ b + c ;
  }
  /*/ <-- tricky comments */

  /**
   * Another comment.
   */
   function something() {
   }

"]</lang> Output:

  
   function subroutine() {
    a =  b + c ;
   }
   

   
    function something() {
    }