Rosetta Code/Find bare lang tags: Difference between revisions

From Rosetta Code
Content added Content deleted
m (→‎{{header|Ruby}}: takes 2 params, not three)
Line 392:
part_uri = ""
Report =, :tasks)
result ={|h,k,v| h[k] =, [])}
tasks.each do |task|
Line 425:
1 in PHP (["Greatest_subsequential_sum"])
For all the extra credit (note, takes a substantial amount of time due to number of HTTP requests):

Revision as of 21:45, 2 March 2014

Rosetta Code/Find bare lang tags
You are encouraged to solve this task according to the task description, using any language you may know.

Find all <lang> tags without a language specified in the text of a page. Display counts by language section:



<lang C>printf("Hello world!\n");</lang>

<lang>print "Hello world!\n"</lang>

should display something like

2 bare language tags.

1 in perl
1 in no language

For extra credit, allow multiple files to be read. Summarize all results by language:

5 bare language tags.

2 in c ([[Foo]], [[Bar]])
1 in perl ([[Foo]])
2 in no language ([[Baz]])

For more extra credit, use the Media Wiki API to test actual RC tasks.


This code has no syntax highlighting, because Rosetta Code's highlighter fails with code that contains literal </lang> tags.

Stole RegEx Needle from Perl

task =


<lang C>printf("Hello world!\n");</lang>

<lang>print "Hello world!\n"</lang>
lang := "no language", out := Object(lang, 0), total := 0
Loop Parse, task, `r`n
	If RegExMatch(A_LoopField, "==\s*{{\s*header\s*\|\s*([^\s\}]+)\s*}}\s*==", $)
		lang := $1, out[lang] := 0
	else if InStr(A_LoopField, "<lang>")
For lang, num in Out
	If num
		total++, str .= "`n" num " in " lang
MsgBox % clipboard := total " bare lang tags.`n" . str


2 bare lang tags.

1 in no language
1 in Perl


<lang Erlang> -module( find_bare_lang_tags ).

-export( [task/0] ).

%% Reads priv/find_bare_lang_tags_1, folds count_empty_lang/2 over its
%% newline-separated lines (starting in the "no language" section with an
%% empty dict), then prints the grand total followed by one "N in Lang" line
%% per key of the resulting dict.
%% Crashes with badmatch if the file cannot be read.
task() ->
    {ok, Contents} = file:read_file( "priv/find_bare_lang_tags_1" ),
    Rows = string:tokens( erlang:binary_to_list(Contents), "\n" ),
    {_Last_lang, Counters} = lists:foldl( fun count_empty_lang/2, {"no language", dict:new()}, Rows ),
    Tallies = lists:map( fun(Lang) -> {dict:fetch(Lang, Counters), Lang} end, dict:fetch_keys(Counters) ),
    Total = lists:sum( [N || {N, _Lang} <- Tallies] ),
    io:fwrite( "~p bare language tags.~n", [Total] ),
    [io:fwrite( "~p in ~p~n", [N, Lang] ) || {N, Lang} <- Tallies].

count_empty_lang( Line, {Lang, Dict} ) -> Empty_lang = string:str( Line, "<lang>" ), New_dict = dict_update_counter( Empty_lang, Lang, Dict ), New_lang = new_lang( string:str( Line,"==[[:Category:{{{1}}}|{{{1}}}]] [[Category:{{{1}}}]] Property "Implemented in language" (as page type) with input value "{{{1}}}" contains invalid characters or is incomplete and therefore can cause unexpected results during a query or annotation process.==" ), string:sub_string( Line, Start+1, Stop-1 ). </lang>

60>  find_bare_lang_tags:task().
2 bare language tags.
1 in "no language"
1 in "Perl"


There are actually many different Regex packages available for Haskell. For this example, I chose TDFA, a very fast POSIX ERE engine. To change engines, simply change the import statement. If you use a Perl-style RE engine, you'll have to modify the expressions slightly.

This solution can be compiled into a program that will either take a space-delimited list of files as its arguments, or take input from STDIN if no arguments are provided. Additionally, if you specify the -w flag in the first argument, it will take a list of Rosetta Code wiki pages and search them. Note that the page names must be as they appear in your URL bar -- underscores in place of spaces.

<lang Haskell>import System.Environment import Network.HTTP import Text.Printf import Text.Regex.TDFA import Data.List import Data.Array import qualified Data.Map as Map

{-| Removes every matched span from the string and returns the pieces of text
    that remain around the matches, in left-to-right order. -}
splitByMatches :: String -> [MatchText String] -> [String]
splitByMatches str matches = foldr cutAt [str] matches
  where
    -- foldr handles the last match first, so the head of the accumulator is
    -- always the still-unsplit prefix of the input; offsets measured from
    -- the start of the whole string therefore stay valid for it.
    cutAt match acc =
        let (offset, len) = snd (match ! 0)
            prefix = head acc
        in take offset prefix : drop (offset + len) prefix : tail acc

{-| Takes a string and counts the number of times a valid, but bare, lang tag
    appears.  It does not attempt to ignore valid tags inside lang blocks.

    The pattern uses the POSIX class @[[:space:]]@ so that a tag padded with
    whitespace (e.g. @<lang >@) is counted as bare too. -}
countBareLangTags :: String -> Int
countBareLangTags = matchCount (makeRegex "<lang[[:space:]]*>" :: Regex)

{-| Takes a string and counts the number of bare lang tags per section of the
    text.  All tags before the first section are put into the key "".
    Sections without any bare tag are left out of the result; sections that
    share a header name have their counts added together. -}
countByLanguage :: String -> Map.Map String Int
countByLanguage str = Map.fromListWith (+) . filter ((>0).snd) $ zip langs counts
    where counts  =  map countBareLangTags.splitByMatches str$ allMatches
          -- Capture group 1 of each header match is the language name; the
          -- leading "" labels the text that precedes the first header.
          langs  =  "":(map (fst.(!1)) allMatches)
          allMatches  =  matchAllText (makeRegex headerRegex :: Regex) str
          -- POSIX ERE for a section header such as =={{header|Perl}}==,
          -- allowing optional whitespace around each part.
          headerRegex  =  "==[[:space:]]*{{[[:space:]]*header[[:space:]]*\\|[[:space:]]*([^ }]*)[[:space:]]*}}[^=]*=="

{-| Entry point.  Chooses the input source from the command-line arguments,
    counts bare lang tags per language in every input, merges the per-input
    results and prints the summary.

    * no arguments: read standard input
    * exactly one argument: read that file, without showing its name
    * first argument @-w@ plus more: fetch the remaining arguments as wiki
      page titles
    * anything else: read every argument as a file and show the file names -}
main :: IO ()
main = do
    args <- getArgs
    (contents, files) <- case args of
        -- If there aren't arguments, read from stdin
        [] -> do
            content  <-  getContents
            return ([content],[""])
        -- If there's only one argument, read the file, but don't display
        -- the filename in the results.  This case is tested before "-w",
        -- so a lone "-w" is treated as a file name.
        [file] -> do
            content  <-  readFile file
            return ([content],[""])
        -- If there's more than one argument and the first one is the -w option,
        -- use the rest of the arguments as page titles and load them from the
        -- wiki.  A single title is not displayed, mirroring the one-file case.
        ("-w":titles) -> do
            pages  <-  mapM getPageContent titles
            return (pages, if length titles > 1 then titles else [""])
        -- Otherwise, read all the files and display their file names.
        _ -> do
            texts  <-  mapM readFile args
            return (texts, args)
    let tagsPerLang  =  map countByLanguage contents
    let tagsWithFiles  =  zipWith addFileToTags files tagsPerLang
    let combinedFiles  =  Map.unionsWith combine tagsWithFiles
    printBareTags combinedFiles
  where
    -- Pair every per-language count with the input it came from, so each map
    -- has the (count, sources) values that printBareTags expects.
    addFileToTags file  =  Map.map (\count -> (count, [file]))
    -- Merge two entries for the same language: add counts, append sources.
    combine cur next  =  (fst cur + fst next, snd cur ++ snd next)

{-| Prints the total number of bare tags, a blank line, and then one line per
    language in ascending key order.  The empty key is shown as
    "no language"; the source list is rendered by 'filesString'. -}
printBareTags :: Map.Map String (Int,[String]) -> IO ()
printBareTags tags = do
    let numBare = sum (map fst (Map.elems tags))
    printf "%d bare language tags:\n\n" numBare
    mapM_ printLine (Map.toAscList tags)
  where
    printLine :: (String, (Int, [String])) -> IO ()
    printLine (lang, (count, files)) =
        printf "%d in %s%s\n" count (label lang) (filesString files)
    label "" = "no language"
    label lang = lang

{-| Renders the list of source names as a parenthesised, comma-separated list
    of wiki links, e.g. @" ([[Foo]], [[Bar]])"@.  Leading empty names are
    skipped; if nothing is left, the result is the empty string. -}
filesString :: [String] -> String
filesString [] = ""
filesString ("":rest) = filesString rest
filesString files = " (" ++ intercalate ", " (map wikiLink files) ++ ")"
  where
    -- Wrap a page or file name in wiki link brackets.
    wikiLink file = "[[" ++ file ++ "]]"

getPageContent :: String -> IO String getPageContent title = do

   response  <-  simpleHTTP.getRequest$ url
   getResponseBody response
       where url  =  ""++title</lang>

Here are the input files I used to test:


<lang C>printf("Hello world!\n");</lang>

<lang>print "Hello world!\n"</lang>


<lang>printf("Hello world!\n");</lang>

<lang>print "Hello world!\n"</lang>
<lang Perl>print "Goodbye world!\n"</lang>

<lang>hubris lang = "I'm so much better than a "++lang++" programmer because I program in Haskell."</lang>

And the output:

6 bare language tags:

2 in no language ([[]], [[]])
1 in C ([[]])
1 in Haskell ([[]])
2 in Perl ([[]], [[]])

Additionally, I tested with 100_doors and Huffman_coding. The following resulted:

5 bare language tags:

1 in no language ([[100_doors]])
1 in C ([[Huffman_coding]])
1 in CoffeeScript ([[Huffman_coding]])
1 in Perl ([[Huffman_coding]])
1 in PostScript ([[100_doors]])

Icon and Unicon

The following is a Unicon-specific solution. <lang unicon>import Utils # To get the FindFirst class

procedure main()

   keys := ["{{header|","<lang>"]
   lang := "No language"
   tags := table(0)
   total := 0
   ff := FindFirst(keys)
   f := reads(&input, -1)
   f ? while tab(ff.locate()) do {
       if "[[:Category:{{{1}}}|{{{1}}}]] [[Category:{{{1}}}]] Property "Implemented in language" (as page type) with input value "{{{1}}}" contains invalid characters or is incomplete and therefore can cause unexpected results during a query or annotation process.")))
       else (tags[lang] +:= 1, total +:= 1)
   write(total," bare language tags:\n")
   every pair := !sort(tags) do write(pair[2]," in ",pair[1])


Sample run using example given in problem statement:

->rcfblt <
2 bare language tags:

1 in No language
1 in perl


This is a simple implementation that does not attempt either extra credit. <lang perl>my $lang = 'no language'; my $total = 0; my %blanks = (); while (<>) {

 if (m/<lang>/) {
   if (exists $blanks{lc $lang}) {
     $blanks{lc $lang}++
   } else {
     $blanks{lc $lang} = 1
 } elsif (m/==\s*Template:\s*header\s*\\s*==/) {
   $lang = lc $1


if ($total) { print "$total bare language tag" . ($total > 1 ? 's' : ) . ".\n\n"; while ( my ($k, $v) = each(%blanks) ) { print "$k in $v\n" } }</lang>

Perl 6

Translation of: Perl

The only tricky thing here is the use of the ms form of match, short for m:sigspace. This causes whitespace in the regex to be considered "significant", that is, it matches optional whitespace at those positions, as if you'd put \s* there. Of course, the regexes themselves are in Perl 6 syntax, which is quite different from Perl 5 regex syntax (and arguably much cleaner). Regex syntax is perhaps the area in which Perl 6 diverges most from Perl 5. <lang perl6>my $lang = '(no language)'; my $total = 0; my %blanks;

for lines() {

 when / '<lang>' / {
 when ms/ '==' 'Template:' 'header' '' '==' / {
   $lang = $;


say "$total bare language tag{ 's' if $total != 1 }\n"; say .value, ' in ', .key for %blanks.sort;</lang>

2 bare language tags

1 in (no language)
1 in perl


Note that this follows the task, but the output is completely bogus since the actual <lang> tags that it finds are in <pre> and in code...

<lang racket>

  1. lang racket

(require net/url net/uri-codec json)

(define (get-text page)

 (define ((get k) x) (dict-ref x k))
 ((compose1 (get '*) car (get 'revisions) cdar hash->list (get 'pages)
            (get 'query) read-json get-pure-port string->url format)
   `([titles . ,page] [prop . "revisions"] [rvprop . "content"]
     [format . "json"] [action . "query"]))))

(define (find-bare-tags page)

 (define in (open-input-string (get-text page)))
 (define rx
   ((compose1 pregexp string-append)
 (let loop ([lang "no language"] [bare '()])
   (match (regexp-match rx in)
     [(list _ #f) (loop lang (dict-update bare lang add1 0))]
     [(list _ lang) (loop lang bare)]
     [#f (if (null? bare) (printf "no bare language tags\n")
             (begin (printf "~a bare language tags\n" (apply + (map cdr bare)))
                    (for ([b bare]) (printf "  ~a in ~a\n" (cdr b) (car b)))))])))

(find-bare-tags "Rosetta Code/Find bare lang tags") </lang>

8 bare language tags
  2 in no language
  4 in Perl
  1 in AutoHotkey
  1 in Tcl

More-extra credit

Add the following code at the bottom, run, watch results. <lang racket> (define (get-category cat)

 (let loop ([c #f])
   (define t
     ((compose1 read-json get-pure-port string->url format)
       `([list . "categorymembers"] [cmtitle . ,(format "Category:~a" cat)]
         [cmcontinue . ,(and c (dict-ref c 'cmcontinue))]
         [cmlimit . "500"] [format . "json"] [action . "query"]))))
   (define (c-m key) (dict-ref (dict-ref t key '()) 'categorymembers #f))
   (append (for/list ([page (c-m 'query)]) (dict-ref page 'title))
           (cond [(c-m 'query-continue) => loop] [else '()]))))

(for ([page (get-category "Programming Tasks")])

 (printf "Page: ~a " page)
 (find-bare-tags page))



Quoting from the FAQ: "If you just want the raw wikitext without any other information whatsoever, it's best to use index.php's action=raw mode instead of the API" <lang Ruby>require "open-uri" require "cgi"

tasks = ["Greatest_common_divisor", "Greatest_element_of_a_list", "Greatest_subsequential_sum"] part_uri = "" Report =, :tasks) result ={|h,k| h[k] =, [])}

tasks.each do |task|

 puts "processing #{task}"
 current_lang = "no language"
 open(part_uri + CGI.escape(task)).each_line do |line|
   current_lang = Regexp.last_match["lang"] if /==\{\{header\|(?<lang>.+)\}\}==/ =~ line 
   num_no_langs = line.scan(/<lang\s*>/).size
   if num_no_langs > 0 then
     result[current_lang].count += num_no_langs
     result[current_lang].tasks << task


puts "\n#{} bare language tags.\n\n" result.each{|k,v| puts "#{v.count} in #{k} (#{v.tasks})"}</lang>

processing Greatest_common_divisor
processing Greatest_element_of_a_list
processing Greatest_subsequential_sum

10 bare language tags.

2 in Euler Math Toolbox (["Greatest_common_divisor", "Greatest_element_of_a_list"])
1 in gnuplot (["Greatest_common_divisor"])
1 in Bracmat (["Greatest_element_of_a_list"])
2 in МК-61/52 (["Greatest_element_of_a_list", "Greatest_element_of_a_list"])
1 in ooRexx (["Greatest_element_of_a_list"])
2 in Mathprog (["Greatest_subsequential_sum", "Greatest_subsequential_sum"])
1 in PHP (["Greatest_subsequential_sum"])


For all the extra credit (note: this takes a substantial amount of time due to the number of HTTP requests):

Library: Tcllib (Package: json)
Library: Tcllib (Package: textutil::split)
Library: Tcllib (Package: uri)

<lang tcl>package require Tcl 8.5 package require http package require json package require textutil::split package require uri

proc getUrlWithRedirect {base args} {

   set url $base?[http::formatQuery {*}$args]
   while 1 {

set t [http::geturl $url] if {[http::status $t] ne "ok"} { error "Oops: url=$url\nstatus=$s\nhttp code=[http::code $token]" } if {[string match 2?? [http::ncode $t]]} { return $t } # OK, but not 200? Must be a redirect... set url [uri::resolve $url [dict get [http::meta $t] Location]] http::cleanup $t



proc get_tasks {category} {

   global cache
   if {[info exists cache($category)]} {

return $cache($category)

   set query [dict create cmtitle Category:$category]
   set tasks [list]

   while {1} {

set response [getUrlWithRedirect \ action query list categorymembers format json cmlimit 500 {*}$query]

# Get the data out of the message

       set data [json::json2dict [http::data $response]]
       http::cleanup $response

       # add tasks to list
       foreach task [dict get $data query categorymembers] {
           lappend tasks [dict get [dict create {*}$task] title]

       if {[catch {

dict get $data query-continue categorymembers cmcontinue } continue_task]} then {

           # no more continuations, we're done
       dict set query cmcontinue $continue_task
   return [set cache($category) $tasks]

} proc getTaskContent task {

   set token [getUrlWithRedirect \

title $task action raw]

   set content [http::data $token]
   http::cleanup $token
   return $content


proc init {} {

   global total count found
   set total 0
   array set count {}
   array set found {}

} proc findBareTags {pageName pageContent} {

   global total count found
   set t {{}}
   lappend t {*}[textutil::split::splitx $pageContent \


   foreach {sectionName sectionText} $t {

set n [regexp -all {<lang>} $sectionText] if {!$n} continue incr count($sectionName) $n lappend found($sectionName) $pageName incr total $n


} proc printResults {} {

   global total count found
   puts "$total bare language tags."
   if {$total} {

puts "" if {[info exists found()]} { puts "$count() in task descriptions\ (\[\[[join $found() {]], [[}]\]\])" unset found() } foreach sectionName [lsort -dictionary [array names found]] { puts "$count($sectionName) in $sectionName\ (\[\[[join $found($sectionName) {]], [[}]\]\])" }



init set tasks [get_tasks Programming_Tasks]

  1. puts stderr "querying over [llength $tasks] tasks..."

foreach task [get_tasks Programming_Tasks] {

   #puts stderr "$task..."
   findBareTags $task [getTaskContent $task]

} printResults</lang>