S-expressions
S-Expressions are one convenient way to parse and store data.
Write a simple reader and writer for S-Expressions that handles quoted and unquoted strings, integers and floats.
The reader should read a single but nested S-Expression from a string and store it in a suitable datastructure (list, array, etc). Newlines and other whitespace may be ignored unless contained within a quoted string. () inside quoted strings are not interpreted, but treated as part of the string. Handling escaped quotes inside a string is optional. thus (foo"bar) maybe treated as a string 'foo"bar', or as an error.
Languages that support it may treat unquoted strings as symbols.
The reader should be able to read the following input <lang lips>((data "quoted data" 123 4.5)
(data (123 (4.5) "(more" "data)")))</lang>
and eg. in python produce a list as:
<lang python>[["data", "quoted data", 123, 4.5]
["data", [123, [4.5], "(more", "data)"]]]</lang>
The writer should be able to take the produced list and turn it into a new S-Expression. Strings that don't contain whitespace or parentheses () don't need to be quoted in the resulting S-Expression, but as a simplification, any string may be quoted.
OCaml
The file SExpr.mli
:
<lang ocaml>(** This module is a very simple parsing library for S-expressions. *) (* Copyright (C) 2009 Florent Monnier, released under MIT license. *)
type sexpr = Atom of string | Expr of sexpr list (** the type of S-expressions *)
val parse_string : string -> sexpr list (** parse from a string *)
val parse_ic : in_channel -> sexpr list (** parse from an input channel *)
val parse_file : string -> sexpr list (** parse from a file *)
val parse : (unit -> char option) -> sexpr list (** parse from a custom function, [None] indicates the end of the flux *)
val print_sexpr : sexpr list -> unit (** a dump function for the type [sexpr] *)
val print_sexpr_indent : sexpr list -> unit (** same than [print_sexpr] but with indentation *)
val string_of_sexpr : sexpr list -> string (** convert an expression of type [sexpr] into a string *)
val string_of_sexpr_indent : sexpr list -> string (** same than [string_of_sexpr] but with indentation *)</lang>
The file SExpr.ml
:
<lang ocaml>(** This module is a very simple parsing library for S-expressions. *) (* Copyright (C) 2009 Florent Monnier, released under MIT license. *)
type sexpr = Atom of string | Expr of sexpr list
type state =
| Parse_root of sexpr list | Parse_content of sexpr list | Parse_word of Buffer.t * sexpr list | Parse_string of bool * Buffer.t * sexpr list
let parse pop_char =
let rec aux st = match pop_char() with | None -> begin match st with | Parse_root sl -> (List.rev sl) | Parse_content _ | Parse_word _ | Parse_string _ -> failwith "Parsing error: content not closed by parenthesis" end | Some c -> match c with | '(' -> begin match st with | Parse_root sl -> let this = aux(Parse_content []) in aux(Parse_root((Expr this)::sl)) | Parse_content sl -> let this = aux(Parse_content []) in aux(Parse_content((Expr this)::sl)) | Parse_word(w, sl) -> let this = aux(Parse_content []) in aux(Parse_content((Expr this)::Atom(Buffer.contents w)::sl)) | Parse_string(_, s, sl) -> Buffer.add_char s c; aux(Parse_string(false, s, sl)) end | ')' -> begin match st with | Parse_root sl -> failwith "Parsing error: closing parenthesis without openning" | Parse_content sl -> (List.rev sl) | Parse_word(w, sl) -> List.rev(Atom(Buffer.contents w)::sl) | Parse_string(_, s, sl) -> Buffer.add_char s c; aux(Parse_string(false, s, sl)) end | 'a'..'z' | '0'..'9' | 'A'..'Z' | '_' | '.' | '+' | '-' | '/' | '*' -> begin match st with | Parse_root _ -> failwith(Printf.sprintf "Parsing error: char '%c' at root level" c) | Parse_content sl -> let w = Buffer.create 16 in Buffer.add_char w c; aux(Parse_word(w, sl)) | Parse_word(w, sl) -> Buffer.add_char w c; aux(Parse_word(w, sl)) | Parse_string(_, s, sl) -> Buffer.add_char s c; aux(Parse_string(false, s, sl)) end | ' ' | '\n' | '\r' | '\t' -> begin match st with | Parse_root sl -> aux(Parse_root sl) | Parse_content sl -> aux(Parse_content sl) | Parse_word(w, sl) -> aux(Parse_content(Atom(Buffer.contents w)::sl)) | Parse_string(_, s, sl) -> Buffer.add_char s c; aux(Parse_string(false, s, sl)) end | '"' -> begin match st with | Parse_root _ -> failwith "Parse error: double quote at root level" | Parse_content sl -> let s = Buffer.create 74 in aux(Parse_string(false, s, sl)) | Parse_word(w, sl) -> let s = Buffer.create 74 in aux(Parse_string(false, s, Atom(Buffer.contents w)::sl)) | Parse_string(true, s, sl) -> Buffer.add_char s c; aux(Parse_string(false, s, sl)) | Parse_string(false, s, sl) -> aux(Parse_content(Atom(Buffer.contents s)::sl)) end | '\\' -> begin match st with | Parse_string(true, s, sl) -> Buffer.add_char s c; aux(Parse_string(false, s, sl)) | Parse_string(false, s, sl) -> aux(Parse_string(true, s, sl)) | _ -> failwith "Parsing error: escape character in wrong place" end | _ -> begin match st with | Parse_string(_, s, sl) -> Buffer.add_char s c; aux(Parse_string(false, s, sl)) | _ -> failwith(Printf.sprintf "char '%c' not handled" c) end in aux (Parse_root [])
let string_pop_char str =
let len = String.length str in let i = ref(-1) in (function () -> incr i; if !i >= len then None else Some(str.[!i]))
let parse_string str =
parse (string_pop_char str)
let ic_pop_char ic =
(function () -> try Some(input_char ic) with End_of_file -> (None))
let parse_ic ic =
parse (ic_pop_char ic)
let parse_file filename =
let ic = open_in filename in let res = parse_ic ic in close_in ic; (res)
let contains s ch =
let len = String.length s in let rec aux i = if i >= len then false else if s.[i] = ch then true else aux (succ i) in aux 0
let contains_whitespace s =
List.exists (contains s) [' '; '\n'; '\r'; '\t'; '('; ')']
let quote s =
"\"" ^ s ^ "\""
let protect s =
let s = String.escaped s in if contains_whitespace s then quote s else s
let string_of_sexpr s =
let rec aux acc = function | (Atom tag)::tl -> aux ((protect tag)::acc) tl | (Expr e)::tl -> let s = "(" ^ (String.concat " " (aux [] e)) ^ ")" in aux (s::acc) tl | [] -> (List.rev acc) in String.concat " " (aux [] s)
let print_sexpr s =
print_endline (string_of_sexpr s)
let string_of_sexpr_indent s =
let rec aux i acc = function | (Atom tag)::tl -> aux i ((protect tag)::acc) tl | (Expr e)::tl -> let s = "\n" ^ (String.make i ' ') ^ "(" ^ (String.concat " " (aux (succ i) [] e)) ^ ")" in aux i (s::acc) tl | [] -> (List.rev acc) in String.concat "\n" (aux 0 [] s)
let print_sexpr_indent s =
print_endline (string_of_sexpr_indent s)</lang>
Then we compile this small module and test it in the interactive loop:
$ ocamlc -c SExpr.mli $ ocamlc -c SExpr.ml $ ocaml SExpr.cmo Objective Caml version 3.11.2 # open SExpr ;; # let s = read_line () ;; ((data "quoted data" 123 4.5) (data (123 (4.5) "(more" "data)"))) val s : string = "((data \"quoted data\" 123 4.5) (data (123 (4.5) \"(more\" \"data)\")))" # let se = SExpr.parse_string s ;; val se : SExpr.sexpr list = [Expr [Expr [Atom "data"; Atom "quoted data"; Atom "123"; Atom "4.5"]; Expr [Atom "data"; Expr [Atom "123"; Expr [Atom "4.5"]; Atom "(more"; Atom "data)"]]]] # SExpr.print_sexpr se ;; ((data "quoted data" 123 4.5) (data (123 (4.5) "(more" "data)"))) - : unit = () # SExpr.print_sexpr_indent se ;; ( (data "quoted data" 123 4.5) (data (123 (4.5) "(more" "data)"))) - : unit = ()
Pike
this version doesn't yet handle int and float and it doesn't remove unneeded quotes from simple strings <lang pike>string input = "((data \"quoted data\" 123 4.5)\n (data (123 (45) \"(more\" \"data)\")))";
array tokenizer(string input) {
array output = ({}); for(int i=0; i<sizeof(input); i++) { switch(input[i]) { case '(': output+= ({"("}); break; case ')': output += ({")"}); break; case '"': output+=array_sscanf(input[++i..], "%s\"%[ \t\n]")[0..0]; i+=sizeof(output[-1]); break; case ' ': case '\t': case '\n': break; default: output+=array_sscanf(input[i..], "%s%[) \t\n]")[0..0]; i+=sizeof(output[-1])-1; break; } } return output;
}
// this function is based on the logic in Parser.C.group() in the pike library; array group(array tokens) {
ADT.Stack stack=ADT.Stack(); array ret =({});
foreach(tokens;; string token) { switch(token) { case "(": stack->push(ret); ret=({}); break; case ")": if (!sizeof(ret) || !stack->ptr) { // Mismatch werror ("unmatched close parenthesis\n"); return ret; } ret=stack->pop()+({ ret }); break; default: ret+=({token}); break; } } return ret;
}
string sexp(array input) {
array output = ({}); foreach(input;; mixed item) { if (arrayp(item)) output += ({ sexp(item) }); else output += ({ sprintf("%O", item) }); } return "("+output*" "+")";
}
array data = group(tokenizer(input))[0]; string output = sexp(data);</lang>
Output:
({({"data", "quoted data", "123", "4.5"}), ({"data", ({"123", ({"45"}), "(more", "data)"})})}) (("data" "quoted data" "123" "4.5") ("data" ("123" ("45") "(more" "data)")))