XML/Input: Difference between revisions

Content added Content deleted

Inline

Revision as of 18:08, 17 August 2009

Given the below XML fragment, extract the list of student names using whatever means desired. If the only viable method is to use XPath, refer the reader to the task XML and XPath.

<lang xml><Students>
 <Student Name="April" Gender="F" DateOfBirth="1989-01-02" />
 <Student Name="Bob" Gender="M"  DateOfBirth="1990-03-04" />
 <Student Name="Chad" Gender="M"  DateOfBirth="1991-05-06" />
 <Student Name="Dave" Gender="M"  DateOfBirth="1992-07-08">
   <Pet Type="dog" Name="Rover" />
 </Student>
 <Student DateOfBirth="1993-09-10" Gender="F" Name="Émily" />

</Students></lang>

Expected Output

April
Bob
Chad
Dave
Émily

AutoHotkey

simply using regular expressions <lang AutoHotkey>students = ( <Students>

 <Student Name="April" Gender="F" DateOfBirth="1989-01-02" />
 <Student Name="Bob" Gender="M"  DateOfBirth="1990-03-04" />
 <Student Name="Chad" Gender="M"  DateOfBirth="1991-05-06" />
 <Student Name="Dave" Gender="M"  DateOfBirth="1992-07-08">
   <Pet Type="dog" Name="Rover" />
 </Student>
 <Student DateOfBirth="1993-09-10" Gender="F" Name="Émily" />

</Students> )

; Collect the Name attribute of every <Student ...> start tag in `students`.
; The needle anchors on "<Student" followed by whitespace, so the enclosing
; <Students> root and the nested <Pet Name="..."> element are never matched.
; It accepts Name at any position inside the tag and any non-quote characters
; in the value, so a record whose Name is the last attribute, or whose name
; starts with a non-ASCII letter, is reported as well.
needle := "<Student\s[^>]*?\bName=""([^""]*)"""
pos := 0
names := ""
; RegExMatch returns the position of the match (0 when there is none) and
; stores the first capture group in name1. Resuming at pos + 1 steps past the
; "<" of the tag that was just reported.
while, pos := RegExMatch(students, needle, name, pos + 1)
{
    names .= name1 . "`n"
}

msgbox % names </lang>

The following code extracts the value of the attribute "Name" from every Student tag. It does not decode numeric character references of the form &#CODE; — that is left to others. A quick way to cope with them is to output a very simple HTML structure, so that their interpretation is left to an HTML reader/browser.

<lang awk># parse_buf: inspect the tag text collected in the global "buffer". When it
# is a Student tag carrying a Name="..." attribute, remember the attribute
# value as a key of the global array "students". The buffer is emptied in
# every case so the next tag starts from scratch.
function parse_buf() {

   # The three-argument form of match() (a gawk extension) stores the text of
   # the first parenthesised group in mt[1].
   if ( match(buffer, /<Student[ \t]+[^>]*Name[ \t]*=[ \t]*"([^"]*)"/, mt) )
     students[mt[1]] = 1

   buffer = ""

}

BEGIN {

 # Scanner state shared by the rules below.
 buffer = ""   # text of the tag currently being collected
 mode = 0      # 0: outside a tag, 1: inside a tag
 li = 1        # field index at which the mode==0 rule starts scanning
 # An empty field separator makes every character its own field.
 FS=""

}

mode==1 {

 # A tag opened on an earlier record is still unfinished: keep appending
 # characters to the buffer until its closing ">" shows up.
 li = 1
 for(i=1; i <= NF; i++) {
   buffer = buffer $i
   if ( $i == ">" ) {
     # Tag complete: leave tag mode, remember where it ended so the mode==0
     # rule resumes scanning from there, and process the collected text.
     mode = 0
     li = i
     parse_buf()
     break
   }
 }

}

mode==0 {

 # Scan the record one character at a time (FS="" makes each character a
 # field), starting at li, the position where the mode==1 rule stopped, or
 # 1 for a fresh record.
 #
 # The tag state is tracked per character: text between tags is skipped
 # instead of being appended to the buffer, and a tag that opens after an
 # earlier tag on the same record but is not closed on it leaves mode at 1,
 # so the mode==1 rule picks it up on the next record.
 for(i=li; i <= NF; i++) {
   if ( mode == 0 && $i == "<" ) {
     mode = 1
   }
   if ( mode == 1 ) {
     buffer = buffer $i
     if ( $i == ">" ) {
       mode = 0
       parse_buf()
     }
   }
 }
 li = 1

}

END {

 # Each distinct name was stored as an array key; print them one per line.
 # The traversal order of "for (... in ...)" is not specified by awk.
 for(name in students)
   print name

}</lang> Using getXML.awk written by Jan Weber, one could do this:

Works with: gawk

or

Works with: nawk

<lang sh>awk -f getXML.awk sample.xml | awk '

   $1 == "TAG"                 {tag = $2}
   tag == "Student" && /Name=/ {print substr($0, index($0, "=") + 1)}

'</lang> Using xmlparser.awk by Steve Coile, one can do this:

Works with: gawk

<lang sh>gawk -f xmlparser.awk sample.xml | awk '

   $1 == "begin"                                         {tag = $2}
   $1 == "attrib"                                        {attrib = $2}
   $1 == "value" && tag == "STUDENT" && attrib == "name" {print $2}

'</lang>

Both of these produce this output

April
Bob
Chad
Dave
&#x00C9;mily

C

Library: LibXML

<lang c>#include <stdio.h>

include <stdlib.h>
include <string.h>
include <libxml/parser.h>
include <libxml/tree.h>

static void print_names(xmlNode *node) {

 xmlNode *cur_node = NULL;
 for (cur_node = node; cur_node; cur_node = cur_node->next) {
   if (cur_node->type == XML_ELEMENT_NODE) {
     if ( strcmp(cur_node->name, "Student") == 0 ) {

xmlAttr *prop = NULL; if ( (prop = xmlHasProp(cur_node, "Name")) != NULL ) { printf("%s\n", prop->children->content);

}

     }
   }
   print_names(cur_node->children);
 }

}

const char *buffer =

 "<Students>\n"
 "  <Student Name=\"April\" Gender=\"F\" DateOfBirth=\"1989-01-02\" />\n"
 "  <Student Name=\"Bob\" Gender=\"M\"  DateOfBirth=\"1990-03-04\" />\n"
 "  <Student Name=\"Chad\" Gender=\"M\"  DateOfBirth=\"1991-05-06\" />\n"
 "  <Student Name=\"Dave\" Gender=\"M\"  DateOfBirth=\"1992-07-08\">\n"
 "    <Pet Type=\"dog\" Name=\"Rover\" />\n"
 "  </Student>\n"
 "  <Student DateOfBirth=\"1993-09-10\" Gender=\"F\" Name=\"Émily\" />\n"
 "</Students>\n";

/*
 * Parse the in-memory XML document and print the student names.
 * Returns 0 on success and 1 when the document cannot be parsed.
 */
int main(void)
{
  xmlDoc *doc;
  int status = 0;

  doc = xmlReadMemory(buffer, (int) strlen(buffer), NULL, NULL, 0);
  if (doc != NULL) {
    print_names(xmlDocGetRootElement(doc));
    xmlFreeDoc(doc);
  } else {
    /* Report the failure instead of exiting silently with success. */
    fprintf(stderr, "failed to parse the XML document\n");
    status = 1;
  }
  xmlCleanupParser();
  return status;
}</lang>

Common Lisp

Library: Closure XML

<lang lisp>(defparameter *xml-blob* "<Students>

 <Student Name=\"April\" Gender=\"F\" DateOfBirth=\"1989-01-02\" />
 <Student Name=\"Bob\" Gender=\"M\"  DateOfBirth=\"1990-03-04\" />
 <Student Name=\"Chad\" Gender=\"M\"  DateOfBirth=\"1991-05-06\" />
 <Student Name=\"Dave\" Gender=\"M\"  DateOfBirth=\"1992-07-08\">
   <Pet Type=\"dog\" Name=\"Rover\" />
 </Student>
 <Student DateOfBirth=\"1993-09-10\" Gender=\"F\" Name=\"Émily\" />

</Students>")

;; Parse *xml-blob* into a DOM document, then collect the "Name" attribute of
;; every element that is a direct child of the first <Students> element.
(let* ((document (cxml:parse *xml-blob* (cxml-dom:make-dom-builder)))

      ;; first (index 0) element named "Students" in the document
      (students (dom:item (dom:get-elements-by-tag-name document "Students") 0))
      ;; names are pushed onto the front, so they accumulate in reverse order
      (student-names '()))
 ;; (nreverse student-names) is presumably the loop's result form, as in
 ;; DOLIST; the sample result shown after this snippet is in document order.
 (dom:do-node-list (child (dom:child-nodes students) (nreverse student-names))
   ;; only element children contribute a name; other node kinds are skipped
   (when (dom:element-p child)
     (push (dom:get-attribute child "Name") student-names))))</lang>

produces <lang lisp>("April" "Bob" "Chad" "Dave" "Émily")</lang>

J

J's system includes several XML processing libraries. This task is probably best addressed using XPath (this is the type of problem XPath was designed to solve), but the task description implicitly discourages that method. So we can use the SAX library instead:

<lang j> load'xml/sax'

   saxclass 'Students'
   startElement =: ([: smoutput 'Name' getAttribute~ [)^:('Student'-:])
   cocurrent'base'

   process_Students_ XML</lang>
April
Bob
Chad
Dave
Emily

and the definition of XML: <lang j> XML =: noun define

<Students>
  <Student Name="April" />
  <Student Name="Bob" />
  <Student Name="Chad" />
  <Student Name="Dave" />
  <Student Name="Emily" />
</Students>
)</lang>

OCaml

#directory "+xml-light" (* or maybe "+site-lib/xml-light" *) ;;
#load "xml-light.cma" ;;

let x = Xml.parse_string "

 <Students>
   <Student Name=\"April\" Gender=\"F\" DateOfBirth=\"1989-01-02\" />
   <Student Name=\"Bob\" Gender=\"M\"  DateOfBirth=\"1990-03-04\" />
   <Student Name=\"Chad\" Gender=\"M\"  DateOfBirth=\"1991-05-06\" />
   <Student Name=\"Dave\" Gender=\"M\"  DateOfBirth=\"1992-07-08\">
     <Pet Type=\"dog\" Name=\"Rover\" />
   </Student>
   <Student DateOfBirth=\"1993-09-10\" Gender=\"F\" Name=\"Émily\" />
 </Students>"
 in
 Xml.iter (function
   (Xml.Element ("Student", attrs, _)) ->
     (try
       let _, name = 
         List.find (function ("Name", _) -> true | _ -> false) attrs
       in
       print_endline name
     with Not_found -> ())
 | _ -> ()) x ;;

April Bob Chad Dave Émily - : unit = () </lang>

Python

<lang python>import xml.dom.minidom

doc = """<Students>

 <Student Name="April" Gender="F" DateOfBirth="1989-01-02" />
 <Student Name="Bob" Gender="M"  DateOfBirth="1990-03-04" />
 <Student Name="Chad" Gender="M"  DateOfBirth="1991-05-06" />
 <Student Name="Dave" Gender="M"  DateOfBirth="1992-07-08">
   <Pet Type="dog" Name="Rover" />
 </Student>
 <Student DateOfBirth="1993-09-10" Gender="F" Name="Émily" />

</Students>"""

doc = xml.dom.minidom.parseString(doc)

for i in doc.getElementsByTagName("Student"):

   print i.getAttribute("Name")</lang>

R

Library: XML

<lang R> library(XML)

Read in XML string

str <- readLines(tc <- textConnection('<Students>

 <Student Name="April" Gender="F" DateOfBirth="1989-01-02" />
 <Student Name="Bob" Gender="M"  DateOfBirth="1990-03-04" />
 <Student Name="Chad" Gender="M"  DateOfBirth="1991-05-06" />
 <Student Name="Dave" Gender="M"  DateOfBirth="1992-07-08">
   <Pet Type="dog" Name="Rover" />
 </Student>
 <Student DateOfBirth="1993-09-10" Gender="F" Name="Émily" />

</Students>')) close(tc) str </lang>

[1] "<Students>"                                                                 
[2] "  <Student Name=\"April\" Gender=\"F\" DateOfBirth=\"1989-01-02\" />"       
[3] "  <Student Name=\"Bob\" Gender=\"M\"  DateOfBirth=\"1990-03-04\" />"        
[4] "  <Student Name=\"Chad\" Gender=\"M\"  DateOfBirth=\"1991-05-06\" />"       
[5] "  <Student Name=\"Dave\" Gender=\"M\"  DateOfBirth=\"1992-07-08\">"         
[6] "    <Pet Type=\"dog\" Name=\"Rover\" />"                                    
[7] "  </Student>"                                                               
[8] "  <Student DateOfBirth=\"1993-09-10\" Gender=\"F\" Name=\"Émily\" />"
[9] "</Students>"

# Convert to an XML tree

xmltree <- xmlTreeParse(str)

# Retrieve the students, and how many there are

students <- xmltree$doc$children$Students
nstudents <- length(students)

# Get each of their names

studentsnames <- character(nstudents)
# seq_len() yields an empty sequence when there are no students, whereas
# 1:nstudents would count down from 1 to 0 and index a missing child.
for(i in seq_len(nstudents)) {

   this.student <- students$children[i]$Student
   studentsnames[i] <- this.student$attributes["Name"]

}

# Change the encoding so that Emily displays correctly

Encoding(studentsnames) <- "UTF-8"
studentsnames </lang>

[1] "April" "Bob"   "Chad"  "Dave"  "Émily"

Ruby

Library: REXML

<lang ruby>require 'rexml/document' include REXML

doc = Document.new(File.new("sample.xml"))

or
doc = Document.new(xml_string)

without using xpath

doc.each_recursive do |node|

 puts node.attributes["Name"] if node.name == "Student"

end

using xpath

doc.each_element("*/Student") {|node| puts node.attributes["Name"]}</lang>

Slate

Slate's XML Reader is still being developed at the time of this writing.

<lang slate> slate[1]> [ |tree|

 tree: (Xml SimpleParser newOn: '<Students>
   <Student Name="April" Gender="F" DateOfBirth="1989-01-02" />
   <Student Name="Bob" Gender="M"  DateOfBirth="1990-03-04" />
   <Student Name="Chad" Gender="M"  DateOfBirth="1991-05-06" />
   <Student Name="Dave" Gender="M"  DateOfBirth="1992-07-08">
     <Pet Type="dog" Name="Rover" />
   </Student>
   <Student DateOfBirth="1993-09-10" Gender="F" Name="Émily" />
 </Students>') parse.
 tree name = 'Students' ifTrue: [(tree children select: #is: `er <- Xml Element)
                                        do: [|:child| child name = 'Student' ifTrue: [inform: (child attributes at: 'Name' ifAbsent: ['Noname'])]]].

] do. April Bob Chad Dave Émily Nil

</lang>

Tcl

Using

Library: tDOM

<lang tcl>package require tdom

# Parse the XML text held in $xml into a DOM tree.
set tree [dom parse $xml]
# Collect the <Student> elements of the document.
set studentNodes [$tree getElementsByTagName Student]
# or: set studentNodes [[$tree documentElement] childNodes]

# Print the Name attribute of each collected node.
foreach node $studentNodes {

    puts [$node getAttribute Name]

}
</lang>

Using

Library: TclXML

<lang tcl>package require xml

# Create a parser that calls "elem" for every element start tag.
set parser [xml::parser -elementstartcommand elem]

# Start-tag callback: print the Name attribute of Student elements.
#   name    - element name
#   attlist - attribute name/value pairs, read here as a dict
#   args    - any further arguments supplied by the parser (ignored)
proc elem {name attlist args} {

    # A Student without a Name attribute is skipped instead of raising an
    # error from "dict get".
    if {$name eq "Student" && [dict exists $attlist Name]} {
        puts [dict get $attlist Name]
    }

}

$parser parse $xml</lang>

Using just pure-Tcl (originally on http://wiki.tcl.tk/3919): <lang Tcl>proc xml2list xml {

   regsub -all {>\s*<} [string trim $xml " \n\t<>"] "\} \{" xml
   set xml [string map {> "\} \{#text \{" < "\}\} \{"}  $xml]
   set res ""   ;# string to collect the result
   set stack {} ;# track open tags
   set rest {}
   foreach item "{$xml}" {
       switch -regexp -- $item {

^# {append res "{[lrange $item 0 end]} " ; #text item} ^/ { regexp {/(.+)} $item -> tagname ;# end tag set expected [lindex $stack end] set stack [lrange $stack 0 end-1] append res "\}\} "

/$ { # singleton - start and end in one <> group

               regexp {([^ ]+)( (.+))?/$} $item -> tagname - rest
               set rest [lrange [string map {= " "} $rest] 0 end]
               append res "{$tagname [list $rest] {}} "

} default {

               set tagname [lindex $item 0] ;# start tag
               set rest [lrange [string map {= " "} $item] 1 end]
               lappend stack $tagname
               append res "\{$tagname [list $rest] \{"

}

       }
   }
   string map {"\} \}" "\}\}"} [lindex $res 0]   ;#"

} proc deent str {

   regsub -all {&\#x(.+?);} $str {\\u\1} str
   subst -nocommands -novar $str

}

----------------------- Testing the whole thing:

set xml {<Students>

 <Student Name="April" Gender="F" DateOfBirth="1989-01-02" />
 <Student Name="Bob" Gender="M"  DateOfBirth="1990-03-04" />
 <Student Name="Chad" Gender="M"  DateOfBirth="1991-05-06" />
 <Student Name="Dave" Gender="M"  DateOfBirth="1992-07-08">
   <Pet Type="dog" Name="Rover" />
 </Student>
 <Student DateOfBirth="1993-09-10" Gender="F" Name="Émily" /></Students>

} foreach i [lindex [xml2list $xml] 2] {

   if {[lindex $i 0] eq "Student"} {
       foreach {att val} [lindex $i 1] {
           if {$att eq "Name"} {puts [deent $val]}
       }
   }

}

</lang>

Vedit macro language

This implementation finds all Student tags and then displays the contents of their Name parameter. <lang vedit> Repeat(ALL) {

   Search("<Student|X", ERRBREAK)
   #1 = Cur_Pos
   Match_Paren()
   if (Search_Block(/Name=|{",'}/, #1, Cur_Pos, BEGIN+ADVANCE+NOERR+NORESTORE)==0) { Continue }
   #2 = Cur_Pos
   Search(/|{",'}/)
   Type_Block(#2, Cur_Pos)
   Type_Newline

} </lang>

Output:

April
Bob
Chad
Dave
Émily

Visual Basic .NET

             <Student Name="April"/>
             <Student Name="Bob"/>
             <Student Name="Chad"/>
             <Student Name="Dave"/>
             <Student Name="Emily"/>
          </Students>

Dim names = (From node In xml...<Student> Select node.@Name).ToArray

For Each name In names

    Console.WriteLine(name)

Next </lang>

AWK

XMLgawk is an extension of the GNU Awk scripting language. Scripts in AWK are often one-liners. This one-liner implementation searches for Student tags and then displays the contents of their Name attribute. The following line is meant to be typed in on the command line of a Unix shell or an MS-DOS command window. <lang XMLgawk> gawk -lxml 'XMLSTARTELEM == "Student" {print XMLATTR["Name"]}' rosetta.xml </lang>

Output:

April
Bob
Chad
Dave
Émily

@@ Line 514: / Line 514: @@
 =={{header|AWK}}==
-[http://home.vrweb.de/~juergen.kahrs/gawk/XML/ XMLgawk] is an extension of the GNU [http://en.wikipedia.org/wiki/Awk Awk] scripting language. Scripts in AWK are often [http://en.wikipedia.org/wiki/One-liner_program one-liners]. This one-liner implementation searches for ''Student'' tags and then displays the contents of their ''Name'' attribute. The following line is meant to be typed in on the command line of a Unix shell or an MS-DOS command window.
+[http://sourceforge.net/projects/xmlgawk/ XMLgawk] is an extension of the GNU [http://en.wikipedia.org/wiki/Awk Awk] scripting language. Scripts in AWK are often [http://en.wikipedia.org/wiki/One-liner_program one-liners]. This one-liner implementation searches for ''Student'' tags and then displays the contents of their ''Name'' attribute. The following line is meant to be typed in on the command line of a Unix shell or an MS-DOS command window.
 <lang XMLgawk>
 gawk -lxml 'XMLSTARTELEM == "Student" {print XMLATTR["Name"]}' rosetta.xml