S-expressions: Difference between revisions

Content added Content deleted

Inline

Revision as of 02:25, 16 October 2011

S-Expressions are one convenient way to parse and store data.

Write a simple reader and writer for S-Expressions that handles quoted and unquoted strings, integers and floats.

The reader should read a single but nested S-Expression from a string and store it in a suitable datastructure (list, array, etc). Newlines and other whitespace may be ignored unless contained within a quoted string. () inside quoted strings are not interpreted, but treated as part of the string. Handling escaped quotes inside a string is optional. thus (foo"bar) maybe treated as a string 'foo"bar', or as an error.

Languages that support it may treat unquoted strings as symbols.

The reader should be able to read the following input <lang lips>((data "quoted data" 123 4.5)

(data (123 (4.5) "(more" "data)")))</lang>

and eg. in python produce a list as:

<lang python>[["data", "quoted data", 123, 4.5]

["data", [123, [4.5], "(more", "data)"]]]</lang>

The writer should be able to take the produced list and turn it into a new S-Expression. Strings that don't contain whitespace or parentheses () don't need to be quoted in the resulting S-Expression, but as a simplification, any string may be quoted.

OCaml

The file SExpr.mli:

type sexpr = Atom of string | Expr of sexpr list (** the type of S-expressions *)

val parse_string : string -> sexpr list (** parse from a string *)

val parse_ic : in_channel -> sexpr list (** parse from an input channel *)

val parse_file : string -> sexpr list (** parse from a file *)

val parse : (unit -> char option) -> sexpr list (** parse from a custom function, [None] indicates the end of the flux *)

val print_sexpr : sexpr list -> unit (** a dump function for the type [sexpr] *)

val print_sexpr_indent : sexpr list -> unit (** same than [print_sexpr] but with indentation *)

val string_of_sexpr : sexpr list -> string (** convert an expression of type [sexpr] into a string *)

val string_of_sexpr_indent : sexpr list -> string (** same than [string_of_sexpr] but with indentation *)</lang>

The file SExpr.ml:

type sexpr = Atom of string | Expr of sexpr list

type state =

 | Parse_root of sexpr list
 | Parse_content of sexpr list
 | Parse_word of Buffer.t * sexpr list
 | Parse_string of bool * Buffer.t * sexpr list

let parse pop_char =

 let rec aux st =
   match pop_char() with
   | None ->
       begin match st with
       | Parse_root sl -> (List.rev sl)
       | Parse_content _
       | Parse_word _
       | Parse_string _ ->
           failwith "Parsing error: content not closed by parenthesis"
       end
   | Some c ->
       match c with
       | '(' ->
           begin match st with
           | Parse_root sl ->
               let this = aux(Parse_content []) in
               aux(Parse_root((Expr this)::sl))
           | Parse_content sl ->
               let this = aux(Parse_content []) in
               aux(Parse_content((Expr this)::sl))
           | Parse_word(w, sl) ->
               let this = aux(Parse_content []) in
               aux(Parse_content((Expr this)::Atom(Buffer.contents w)::sl))
           | Parse_string(_, s, sl) ->
               Buffer.add_char s c;
               aux(Parse_string(false, s, sl))
           end
       | ')' ->
           begin match st with
           | Parse_root sl ->
               failwith "Parsing error: closing parenthesis without openning"
           | Parse_content sl -> (List.rev sl)
           | Parse_word(w, sl) -> List.rev(Atom(Buffer.contents w)::sl)
           | Parse_string(_, s, sl) ->
               Buffer.add_char s c;
               aux(Parse_string(false, s, sl))
           end
       | 'a'..'z' | '0'..'9'
       | 'A'..'Z' | '_' | '.'
       | '+' | '-' | '/' | '*' ->
           begin match st with
           | Parse_root _ ->
               failwith(Printf.sprintf "Parsing error: char '%c' at root level" c)
           | Parse_content sl ->
               let w = Buffer.create 16 in
               Buffer.add_char w c;
               aux(Parse_word(w, sl))
           | Parse_word(w, sl) ->
               Buffer.add_char w c;
               aux(Parse_word(w, sl))
           | Parse_string(_, s, sl) ->
               Buffer.add_char s c;
               aux(Parse_string(false, s, sl))
           end
       | ' ' | '\n' | '\r' | '\t' ->
           begin match st with
           | Parse_root sl -> aux(Parse_root sl)
           | Parse_content sl -> aux(Parse_content sl)
           | Parse_word(w, sl) -> aux(Parse_content(Atom(Buffer.contents w)::sl))
           | Parse_string(_, s, sl) ->
               Buffer.add_char s c;
               aux(Parse_string(false, s, sl))
           end
       | '"' ->
           (* " *)
           begin match st with
           | Parse_root _ -> failwith "Parse error: double quote at root level"
           | Parse_content sl ->
               let s = Buffer.create 74 in
               aux(Parse_string(false, s, sl))
           | Parse_word(w, sl) ->
               let s = Buffer.create 74 in
               aux(Parse_string(false, s, Atom(Buffer.contents w)::sl))
           | Parse_string(true, s, sl) ->
               Buffer.add_char s c;
               aux(Parse_string(false, s, sl))
           | Parse_string(false, s, sl) ->
               aux(Parse_content(Atom(Buffer.contents s)::sl))
           end
       | '\\' ->
           begin match st with
           | Parse_string(true, s, sl) ->
               Buffer.add_char s c;
               aux(Parse_string(false, s, sl))
           | Parse_string(false, s, sl) ->
               aux(Parse_string(true, s, sl))
           | _ ->
               failwith "Parsing error: escape character in wrong place"
           end
       | _ ->
           begin match st with
           | Parse_string(_, s, sl) ->
               Buffer.add_char s c;
               aux(Parse_string(false, s, sl))
           | _ ->
               failwith(Printf.sprintf "char '%c' not handled" c)
           end
 in
 aux (Parse_root [])

let string_pop_char str =

 let len = String.length str in
 let i = ref(-1) in
 (function () -> incr i; if !i >= len then None else Some(str.[!i]))

let parse_string str =

 parse (string_pop_char str)

let ic_pop_char ic =

 (function () ->
    try Some(input_char ic)
    with End_of_file -> (None))

let parse_ic ic =

 parse (ic_pop_char ic)

let parse_file filename =

 let ic = open_in filename in
 let res = parse_ic ic in
 close_in ic;
 (res)

let contains s ch =

 let len = String.length s in
 let rec aux i =
   if i >= len then false
   else if s.[i] = ch then true
   else aux (succ i)
 in
 aux 0

let contains_whitespace s =

 List.exists (contains s) [' '; '\n'; '\r'; '\t'; '('; ')']

let quote s =

 "\"" ^ s ^ "\""

let protect s =

 let s = String.escaped s in
 if contains_whitespace s then quote s else s

let string_of_sexpr s =

 let rec aux acc = function
 | (Atom tag)::tl -> aux ((protect tag)::acc) tl
 | (Expr e)::tl ->
     let s =
       "(" ^
       (String.concat " " (aux [] e))
       ^ ")"
     in
     aux (s::acc) tl
 | [] -> (List.rev acc)
 in
 String.concat " " (aux [] s)

let print_sexpr s =

 print_endline (string_of_sexpr s)

let string_of_sexpr_indent s =

 let rec aux i acc = function
 | (Atom tag)::tl -> aux i ((protect tag)::acc) tl
 | (Expr e)::tl ->
     let s =
       "\n" ^ (String.make i ' ') ^ "(" ^
       (String.concat " " (aux (succ i) [] e))
       ^ ")"
     in
     aux i (s::acc) tl
 | [] -> (List.rev acc)
 in
 String.concat "\n" (aux 0 [] s)

let print_sexpr_indent s =

 print_endline (string_of_sexpr_indent s)</lang>

Then we compile this small module and test it in the interactive loop:

$ ocamlc -c SExpr.mli
$ ocamlc -c SExpr.ml
$ ocaml SExpr.cmo 
        Objective Caml version 3.11.2

# open SExpr ;;

# let s = read_line () ;;
((data "quoted data" 123 4.5) (data (123 (4.5) "(more" "data)")))
val s : string =
  "((data \"quoted data\" 123 4.5) (data (123 (4.5) \"(more\" \"data)\")))"

# let se = SExpr.parse_string s ;;
val se : SExpr.sexpr list =
  [Expr
    [Expr [Atom "data"; Atom "quoted data"; Atom "123"; Atom "4.5"];
     Expr
      [Atom "data";
       Expr [Atom "123"; Expr [Atom "4.5"]; Atom "(more"; Atom "data)"]]]]

# SExpr.print_sexpr se ;;
((data "quoted data" 123 4.5) (data (123 (4.5) "(more" "data)")))
- : unit = ()

# SExpr.print_sexpr_indent se ;;

(
 (data "quoted data" 123 4.5) 
 (data 
  (123 
   (4.5) "(more" "data)")))
- : unit = ()

Pike

this version doesn't yet handle int and float and it doesn't remove unneeded quotes from simple strings <lang pike>string input = "((data \"quoted data\" 123 4.5)\n (data (123 (45) \"(more\" \"data)\")))";

array tokenizer(string input) {

   array output = ({}); 
   for(int i=0; i<sizeof(input); i++)
   { 
       switch(input[i])
       { 
           case '(': output+= ({"("}); break; 
           case ')': output += ({")"}); break; 
           case '"': output+=array_sscanf(input[++i..], "%s\"%[ \t\n]")[0..0]; 
                     i+=sizeof(output[-1]); 
                     break; 
           case ' ': 
           case '\t': 
           case '\n': break; 
           default: output+=array_sscanf(input[i..], "%s%[) \t\n]")[0..0]; 
                    i+=sizeof(output[-1])-1; break; 
       }
   }
   return output;

}

// this function is based on the logic in Parser.C.group() in the pike library; array group(array tokens) {

   ADT.Stack stack=ADT.Stack();
   array ret =({});

   foreach(tokens;; string token)
   {
       switch(token)
       {
           case "(": stack->push(ret); ret=({}); break;
           case ")":
                   if (!sizeof(ret) || !stack->ptr) 
                   {
                     // Mismatch
                       werror ("unmatched close parenthesis\n");
                       return ret;
                   }
                   ret=stack->pop()+({ ret }); 
                   break;
           default: ret+=({token}); break;
       }
   }
   return ret;

}

string sexp(array input) {

   array output = ({});
   foreach(input;; mixed item)
   {
       if (arrayp(item))
           output += ({ sexp(item) });
       else
           output += ({ sprintf("%O", item) });
   }
   return "("+output*" "+")";

}

array data = group(tokenizer(input))[0]; string output = sexp(data);</lang>

Output:

({({"data", "quoted data", "123", "4.5"}), ({"data", ({"123", ({"45"}), "(more", "data)"})})})
(("data" "quoted data" "123" "4.5") ("data" ("123" ("45") "(more" "data)")))