Text to HTML: Difference between revisions

Content added Content deleted

Inline

Revision as of 10:58, 5 January 2012

When developing a website it is occasionally necessary to handle text that is received without formatting, and present it in a pleasing manner. To achieve this, the text needs to be converted to HTML.

Write a converter from plain text to HTML.

The plain text has no formatting information.

It may have centered headlines, numbered sections, paragraphs, lists, and URIs. It could even have tables.

Simple converters restrict themselves to identifying paragraphs, but I believe more can be done if the text is analyzed.

You are not required to copy the algorithm from the existing solutions; instead, use whatever facilities are available in your language to best solve the problem.

The only requirement is to ensure that the result is valid xhtml.

Pike

algorithm:

split by line
find average line length to identify centered lines
find isolated lines to identify section headings
find URIs
identify section numbering
identify bullet and numbered lists
identify paragraphs
identify indented lines
if possible identify tables

to ensure valid xhtml create a nested structure:

create an xml node
add elements to node
add lines to element if appropriate

this implementation is still incomplete. <lang Pike>// function to calculate the average line length (not used yet below)
//
// lines:   array of text lines (strings); empty lines are ignored.
// returns: the integer average length of the larger half of the non-empty
//          lines, leaving out the longest 5% as outliers; 0 when there are
//          no non-empty lines at all.
int linelength(array lines) {

   // lengths of all non-empty lines, in ascending order
   array sizes = sizeof(lines[*])-({0});
   sizes = sort(sizes);

   int count = sizeof(sizes);
   if (!count)
       return 0;   // nothing to average; avoids a division by zero below

   // only consider the larger half of lines minus the top 5%
   // (the last valid index is count-1, so the top count/20 entries end
   // at count-1-count/20; for fewer than 20 lines nothing is dropped)
   array larger = sizes[count/2..count-1-count/20];

   int averagelarger = `+(@larger)/sizeof(larger);
   return averagelarger;

}

// Report whether the line at `index` is blank. A line counts as blank when
// it holds nothing but spaces and tabs; positions outside the array (before
// the first or after the last line) are treated as blank as well, so the
// first and last line of a text can be classified without wrapping around
// or indexing past the end.
private int is_blank_at(array lines, int index)
{
    if (index < 0 || index >= sizeof(lines))
        return 1;
    return lines[index] - " " - "\t" == "";
}

// Classify every input line.
//
// lines:   array of text lines (strings).
// returns: an array with one entry per input line. Non-empty lines become
//          ({ kind, stripped_text, indent_position }) where kind is one of
//          "heading", "bullet", "number", "indent" or "regular"; empty
//          lines become ({ "empty" }), which the last pass may rename to
//          "new paragraph", "end paragraph" or "begin paragraph".
array mark_up(array lines) {

    array markup = ({});

    // pass 1: find special lines
    foreach(lines; int index; string line)
    {
        string strippedline = String.trim_whites(line);
        if (!sizeof(strippedline))
        {
            markup += ({ ({ "empty" }) });
            continue;
        }

        string firstchar = strippedline[0..0];
        // column of the first non-blank character, i.e. the indentation
        int pos = search(line, firstchar);

        // a line with blank (or missing) neighbours on both sides stands alone
        if (is_blank_at(lines, index-1) && is_blank_at(lines, index+1))
            markup += ({ ({ "heading", strippedline, pos }) });
        else if (firstchar == "*")
            markup += ({ ({ "bullet", strippedline, pos }) });
        else if ( (<"0","1","2","3","4","5","6","7","8","9">)[firstchar] )
            markup += ({ ({ "number", strippedline, pos }) });
        else if (pos > 0)
            markup += ({ ({ "indent", strippedline, pos }) });
        else
            markup += ({ ({ "regular", strippedline, pos }) });
    }

    // pass 2: a regular line whose neighbours are both not regular is
    // promoted to a heading (first and last entry are left alone)
    foreach(markup; int index; array line)
    {
        if (index > 0 && index < sizeof(markup)-1 )
        {
            if (line[0] == "regular" && markup[index-1][0] != "regular" && markup[index+1][0] != "regular")
                line[0] = "heading";
        }
    }

    // pass 3: find paragraphs -- an empty line is renamed according to
    // whether regular text ends before it and/or starts after it
    foreach(markup; int index; array line)
    {
        if (index > 0 && index < sizeof(markup)-1 && line[0] == "empty")
        {
            int before = markup[index-1][0] == "regular";
            int after = markup[index+1][0] == "regular";

            if (before && after)
                line[0] = "new paragraph";
            else if (before)
                line[0] = "end paragraph";
            else if (after)
                line[0] = "begin paragraph";
        }
    }
    return markup;

}

// Drop the innermost open element from the stack, but never the outermost
// one: the root <div> must stay so that later lines still have a parent.
private array close_element(array stack)
{
    if (sizeof(stack) > 1)
        return Array.pop(stack)[1];
    return stack;
}

// Append one line of text to `element`. When the element already has
// content, the shared newline text node is inserted first so that the words
// of consecutive lines do not run together.
private void append_text_line(object element, string text, object newline)
{
    if (sizeof(element->get_children()))
        element->add_child(newline);
    element->add_child(Parser.XML.Tree.SimpleNode(Parser.XML.Tree.XML_TEXT, "", ([]), text));
}

// Build an XML tree from classified lines.
//
// markup:  array of entries whose first element is the line kind
//          ("heading", "bullet", "number", "indent", "regular", "empty",
//          "new paragraph", "begin paragraph", "end paragraph") and whose
//          second element, where present, is the line text.
// returns: a Parser.XML.Tree.SimpleRootNode holding a single <div> with
//          the generated <h3>, <li> and <p> elements and text nodes.
//
// NOTE(review): <li> elements are attached directly to the enclosing
// element; no <ul>/<ol> wrapper is generated here.
object make_tree(array markup) {

    object root = Parser.XML.Tree.SimpleRootNode();
    // one text node holding "\n", reused wherever a line break is emitted
    object newline = Parser.XML.Tree.SimpleNode(Parser.XML.Tree.XML_TEXT, "", ([]), "\n");
    // stack of open elements; current[-1] receives new children
    array current = ({ Parser.XML.Tree.SimpleNode(Parser.XML.Tree.XML_ELEMENT, "div", ([]), "") });
    root->add_child(current[-1]);

    foreach (markup; int index; array line)
    {
        switch(line[0])
        {
            case "heading":
                current[-1]->add_child(newline);
                object h = Parser.XML.Tree.SimpleNode(Parser.XML.Tree.XML_ELEMENT, "h3", ([]), "");
                h->add_child(Parser.XML.Tree.SimpleNode(Parser.XML.Tree.XML_TEXT, "", ([]), line[1]));
                current[-1]->add_child(h);
                current[-1]->add_child(newline);
                break;
            case "bullet":
            case "number":
                // a new item closes the previous item first
                if (current[-1]->get_tag_name() == "li")
                    current = close_element(current);
                current[-1]->add_child(newline);
                object li = Parser.XML.Tree.SimpleNode(Parser.XML.Tree.XML_ELEMENT, "li", ([]), "");
                li->add_child(Parser.XML.Tree.SimpleNode(Parser.XML.Tree.XML_TEXT, "", ([]), line[1]));
                current[-1]->add_child(li);
                current = Array.push(current, li);
                break;
            case "indent":
                // directly after a list item the indented text continues
                // that item; otherwise the innermost element is closed.
                // index 0 has no predecessor (markup[-1] would wrap around
                // to the last entry), so it is handled explicitly.
                if (index == 0
                    || (markup[index-1][0] != "bullet" && markup[index-1][0] != "number"))
                    current = close_element(current);
                append_text_line(current[-1], line[1], newline);
                break;
            case "new paragraph":
                current = close_element(current);
                current[-1]->add_child(newline);
                // fall through: open the next paragraph
            case "begin paragraph":
                object p = Parser.XML.Tree.SimpleNode(Parser.XML.Tree.XML_ELEMENT, "p", ([]), "");
                current[-1]->add_child(p);
                current = Array.push(current, p);
                break;
            case "end paragraph":
                current = close_element(current);
                current[-1]->add_child(newline);
                break;
            case "regular":
                append_text_line(current[-1], line[1], newline);
                // fall through
            case "empty":
                break;
        }
    }
    return root;

}</lang>

@@ Line 15: / Line 15: @@
 The only requirement is to ensure that the result is valid xhtml.
-=={{header|pike}}==
+=={{header|Pike}}==
 algorithm:
 * split by line