S-expressions: Difference between revisions
(→{{header|Pike}}: handle numbers) |
|||
Line 292: | Line 292: | ||
case 'q': return name; |
case 'q': return name; |
||
case 't': return "Symbol"; |
case 't': return "Symbol"; |
||
case 'b': case 'u': case 'o': case 'x': case 'X': |
|||
if((int)name || name == "0") |
|||
return sprintf("%"+int2char(type), (int)name); |
|||
break; |
|||
case 'e': case 'E': case 'f': case 'F': case 'g': case 'G': |
|||
if((float)name || name == "0") |
|||
return sprintf("%"+int2char(type), (float)name); |
|||
break; |
|||
default: return sprintf("%"+int2char(type), name); |
default: return sprintf("%"+int2char(type), name); |
||
} |
} |
||
Line 309: | Line 301: | ||
{ |
{ |
||
case "string": return name; |
case "string": return name; |
||
case "float": return (float)name; |
|||
⚫ | |||
default: throw(sprintf("can not cast 'Symbol' to '%s'", type)); |
default: throw(sprintf("can not cast 'Symbol' to '%s'", type)); |
||
} |
} |
||
} |
} |
||
} |
|||
mixed value(string token) |
|||
{ |
|||
if ((string)(int)token==token) |
|||
⚫ | |||
array result = array_sscanf(token, "%f%s"); |
|||
if (sizeof(result) && floatp(result[0]) && ! sizeof(result[1])) |
|||
return result[0]; |
|||
else |
|||
return Symbol(token); |
|||
} |
} |
||
Line 333: | Line 334: | ||
case '\n': break; |
case '\n': break; |
||
default: string token = array_sscanf(input[i..], "%s%[) \t\n]")[0]; |
default: string token = array_sscanf(input[i..], "%s%[) \t\n]")[0]; |
||
output+=({ |
output+=({ value(token) }); |
||
i+=sizeof(token)-1; |
i+=sizeof(token)-1; |
||
break; |
break; |
||
Line 374: | Line 375: | ||
if (arrayp(item)) |
if (arrayp(item)) |
||
output += ({ sexp(item) }); |
output += ({ sexp(item) }); |
||
else if (intp(item)) |
|||
output += ({ sprintf("%d", item) }); |
|||
else if (floatp(item)) |
|||
output += ({ sprintf("%f", item) }); |
|||
else |
else |
||
output += ({ sprintf("%q", item) }); |
output += ({ sprintf("%q", item) }); |
||
Line 380: | Line 385: | ||
} |
} |
||
string input = "((data \"quoted data\" 123 4.5)\n (data (!@# ( |
string input = "((data \"quoted data\" 123 4.5)\n (data (!@# (4.5) \"(more\" \"data)\")))"; |
||
array data = group(tokenizer(input))[0]; |
array data = group(tokenizer(input))[0]; |
||
string output = sexp(data);</lang> |
string output = sexp(data);</lang> |
||
Output: |
Output: |
||
({ ({ (Symbol: data), "quoted data", |
({ ({ (Symbol: data), "quoted data", 123, 4.5 }), ({ (Symbol: data), ({ (Symbol: !@#), ({ 4.5 }), "(more", "data)" }) }) }) |
||
((data "quoted data" 123 4.5) (data (!@# (45) "(more" "data)"))) |
((data "quoted data" 123 4.5) (data (!@# (45) "(more" "data)"))) |
||
Revision as of 14:47, 16 October 2011
S-Expressions are one convenient way to parse and store data.
Write a simple reader and writer for S-Expressions that handles quoted and unquoted strings, integers and floats.
The reader should read a single but nested S-Expression from a string and store it in a suitable datastructure (list, array, etc). Newlines and other whitespace may be ignored unless contained within a quoted string. () inside quoted strings are not interpreted, but treated as part of the string. Handling escaped quotes inside a string is optional. thus (foo"bar) maybe treated as a string 'foo"bar', or as an error.
Languages that support it may treat unquoted strings as symbols.
Note that with the exception of ()" (\ if escaping is supported) and whitespace there are no special characters. Anything else is allowed without quotes.
The reader should be able to read the following input <lang lips>((data "quoted data" 123 4.5)
(data (!@# (4.5) "(more" "data)")))</lang>
and eg. in python produce a list as:
<lang python>[["data", "quoted data", 123, 4.5]
["data", ["!@#", [4.5], "(more", "data)"]]]</lang>
The writer should be able to take the produced list and turn it into a new S-Expression. Strings that don't contain whitespace or parentheses () don't need to be quoted in the resulting S-Expression, but as a simplification, any string may be quoted.
OCaml
The file SExpr.mli
:
<lang ocaml>(** This module is a very simple parsing library for S-expressions. *) (* Copyright (C) 2009 Florent Monnier, released under MIT license. *)
type sexpr = Atom of string | Expr of sexpr list (** the type of S-expressions *)
val parse_string : string -> sexpr list (** parse from a string *)
val parse_ic : in_channel -> sexpr list (** parse from an input channel *)
val parse_file : string -> sexpr list (** parse from a file *)
val parse : (unit -> char option) -> sexpr list (** parse from a custom function, [None] indicates the end of the flux *)
val print_sexpr : sexpr list -> unit (** a dump function for the type [sexpr] *)
val print_sexpr_indent : sexpr list -> unit (** same than [print_sexpr] but with indentation *)
val string_of_sexpr : sexpr list -> string (** convert an expression of type [sexpr] into a string *)
val string_of_sexpr_indent : sexpr list -> string (** same than [string_of_sexpr] but with indentation *)</lang>
The file SExpr.ml
:
<lang ocaml>(** This module is a very simple parsing library for S-expressions. *) (* Copyright (C) 2009 Florent Monnier, released under MIT license. *) (* modified to match the task description *)
type sexpr = Atom of string | Expr of sexpr list
type state =
| Parse_root of sexpr list | Parse_content of sexpr list | Parse_word of Buffer.t * sexpr list | Parse_string of bool * Buffer.t * sexpr list
let parse pop_char =
let rec aux st = match pop_char() with | None -> begin match st with | Parse_root sl -> (List.rev sl) | Parse_content _ | Parse_word _ | Parse_string _ -> failwith "Parsing error: content not closed by parenthesis" end | Some c -> match c with | '(' -> begin match st with | Parse_root sl -> let this = aux(Parse_content []) in aux(Parse_root((Expr this)::sl)) | Parse_content sl -> let this = aux(Parse_content []) in aux(Parse_content((Expr this)::sl)) | Parse_word(w, sl) -> let this = aux(Parse_content []) in aux(Parse_content((Expr this)::Atom(Buffer.contents w)::sl)) | Parse_string(_, s, sl) -> Buffer.add_char s c; aux(Parse_string(false, s, sl)) end | ')' -> begin match st with | Parse_root sl -> failwith "Parsing error: closing parenthesis without openning" | Parse_content sl -> (List.rev sl) | Parse_word(w, sl) -> List.rev(Atom(Buffer.contents w)::sl) | Parse_string(_, s, sl) -> Buffer.add_char s c; aux(Parse_string(false, s, sl)) end | ' ' | '\n' | '\r' | '\t' -> begin match st with | Parse_root sl -> aux(Parse_root sl) | Parse_content sl -> aux(Parse_content sl) | Parse_word(w, sl) -> aux(Parse_content(Atom(Buffer.contents w)::sl)) | Parse_string(_, s, sl) -> Buffer.add_char s c; aux(Parse_string(false, s, sl)) end | '"' -> (* '"' *) begin match st with | Parse_root _ -> failwith "Parse error: double quote at root level" | Parse_content sl -> let s = Buffer.create 74 in aux(Parse_string(false, s, sl)) | Parse_word(w, sl) -> let s = Buffer.create 74 in aux(Parse_string(false, s, Atom(Buffer.contents w)::sl)) | Parse_string(true, s, sl) -> Buffer.add_char s c; aux(Parse_string(false, s, sl)) | Parse_string(false, s, sl) -> aux(Parse_content(Atom(Buffer.contents s)::sl)) end | '\\' -> begin match st with | Parse_string(true, s, sl) -> Buffer.add_char s c; aux(Parse_string(false, s, sl)) | Parse_string(false, s, sl) -> aux(Parse_string(true, s, sl)) | _ -> failwith "Parsing error: escape character in wrong place" end | _ -> begin match st with | Parse_root _ -> failwith(Printf.sprintf "Parsing error: char '%c' at root level" c) | Parse_content sl -> let w = Buffer.create 16 in Buffer.add_char w c; aux(Parse_word(w, sl)) | Parse_word(w, sl) -> Buffer.add_char w c; aux(Parse_word(w, sl)) | Parse_string(_, s, sl) -> Buffer.add_char s c; aux(Parse_string(false, s, sl)) end in aux (Parse_root [])
let string_pop_char str =
let len = String.length str in let i = ref(-1) in (function () -> incr i; if !i >= len then None else Some(str.[!i]))
let parse_string str =
parse (string_pop_char str)
let ic_pop_char ic =
(function () -> try Some(input_char ic) with End_of_file -> (None))
let parse_ic ic =
parse (ic_pop_char ic)
let parse_file filename =
let ic = open_in filename in let res = parse_ic ic in close_in ic; (res)
let contains s ch =
let len = String.length s in let rec aux i = if i >= len then false else if s.[i] = ch then true else aux (succ i) in aux 0
let contains_whitespace s =
List.exists (contains s) [' '; '\n'; '\r'; '\t'; '('; ')']
let quote s =
"\"" ^ s ^ "\""
let protect s =
let s = String.escaped s in if contains_whitespace s then quote s else s
let string_of_sexpr s =
let rec aux acc = function | (Atom tag)::tl -> aux ((protect tag)::acc) tl | (Expr e)::tl -> let s = "(" ^ (String.concat " " (aux [] e)) ^ ")" in aux (s::acc) tl | [] -> (List.rev acc) in String.concat " " (aux [] s)
let print_sexpr s =
print_endline (string_of_sexpr s)
let string_of_sexpr_indent s =
let rec aux i acc = function | (Atom tag)::tl -> aux i ((protect tag)::acc) tl | (Expr e)::tl -> let s = "\n" ^ (String.make i ' ') ^ "(" ^ (String.concat " " (aux (succ i) [] e)) ^ ")" in aux i (s::acc) tl | [] -> (List.rev acc) in String.concat "\n" (aux 0 [] s)
let print_sexpr_indent s =
print_endline (string_of_sexpr_indent s)</lang>
Then we compile this small module and test it in the interactive loop:
$ ocamlc -c SExpr.mli $ ocamlc -c SExpr.ml $ ocaml SExpr.cmo Objective Caml version 3.11.2 # open SExpr ;; # let s = read_line () ;; ((data "quoted data" 123 4.5) (data (123 (4.5) "(more" "data)"))) val s : string = "((data \"quoted data\" 123 4.5) (data (!@# (4.5) \"(more\" \"data)\")))" # let se = SExpr.parse_string s ;; val se : SExpr.sexpr list = [Expr [Expr [Atom "data"; Atom "quoted data"; Atom "123"; Atom "4.5"]; Expr [Atom "data"; Expr [Atom "!@#"; Expr [Atom "4.5"]; Atom "(more"; Atom "data)"]]]] # SExpr.print_sexpr se ;; ((data "quoted data" 123 4.5) (data (!@# (4.5) "(more" "data)"))) - : unit = () # SExpr.print_sexpr_indent se ;; ( (data "quoted data" 123 4.5) (data (!@# (4.5) "(more" "data)"))) - : unit = ()
Pike
<lang pike>class Symbol(string name) {
string _sprintf(int type) { switch(type) { case 's': return name; case 'O': return sprintf("(Symbol: %s)", name||""); case 'q': return name; case 't': return "Symbol"; default: return sprintf("%"+int2char(type), name); } }
mixed cast(string type) { switch(type) { case "string": return name; default: throw(sprintf("can not cast 'Symbol' to '%s'", type)); } }
}
mixed value(string token) {
if ((string)(int)token==token) return (int)token; array result = array_sscanf(token, "%f%s"); if (sizeof(result) && floatp(result[0]) && ! sizeof(result[1])) return result[0]; else return Symbol(token);
}
array tokenizer(string input) {
array output = ({}); for(int i=0; i<sizeof(input); i++) { switch(input[i]) { case '(': output+= ({"("}); break; case ')': output += ({")"}); break; case '"': //" output+=array_sscanf(input[++i..], "%s\"%[ \t\n]")[0..0]; i+=sizeof(output[-1]); break; case ' ': case '\t': case '\n': break; default: string token = array_sscanf(input[i..], "%s%[) \t\n]")[0]; output+=({ value(token) }); i+=sizeof(token)-1; break; } } return output;
}
// this function is based on the logic in Parser.C.group() in the pike library; array group(array tokens) {
ADT.Stack stack=ADT.Stack(); array ret =({});
foreach(tokens;; string token) { switch(token) { case "(": stack->push(ret); ret=({}); break; case ")": if (!sizeof(ret) || !stack->ptr) { // Mismatch werror ("unmatched close parenthesis\n"); return ret; } ret=stack->pop()+({ ret }); break; default: ret+=({token}); break; } } return ret;
}
string sexp(array input) {
array output = ({}); foreach(input;; mixed item) { if (arrayp(item)) output += ({ sexp(item) }); else if (intp(item)) output += ({ sprintf("%d", item) }); else if (floatp(item)) output += ({ sprintf("%f", item) }); else output += ({ sprintf("%q", item) }); } return "("+output*" "+")";
}
string input = "((data \"quoted data\" 123 4.5)\n (data (!@# (4.5) \"(more\" \"data)\")))"; array data = group(tokenizer(input))[0]; string output = sexp(data);</lang>
Output:
({ ({ (Symbol: data), "quoted data", 123, 4.5 }), ({ (Symbol: data), ({ (Symbol: !@#), ({ 4.5 }), "(more", "data)" }) }) }) ((data "quoted data" 123 4.5) (data (!@# (45) "(more" "data)")))
Python
<lang python>import re
dbg = False
term_regex = r(?mx)
\s*(?: (?P<brackl>\()| (?P<brackr>\))| (?P<num>\d+\.\d+|\d+)\b| (?P<sq>"[^"]*")| (?P\S+)\b )
def parse_sexp(sexp):
stack = [] out = [] if dbg: print("%-6s %-14s %-44s %-s" % tuple("term value out stack".split())) for termtypes in re.finditer(term_regex, sexp): term, value = [(t,v) for t,v in termtypes.groupdict().items() if v][0] if dbg: print("%-7s %-14s %-44r %-r" % (term, value, out, stack)) if term == 'brackl': stack.append(out) out = [] elif term == 'brackr': assert stack, "Trouble with nesting of brackets" tmpout, out = out, stack.pop(-1) out.append(tmpout) elif term == 'num': v = float(value) if v.is_integer(): v = int(v) out.append(v) elif term == 'sq': out.append(value[1:-1]) elif term == 's': out.append(value) else: raise NotImplementedError("Error: %r" % (term, value)) assert not stack, "Trouble with nesting of brackets" return out[0]
def print_sexp(exp):
out = if type(exp) == type([]): out += '(' + ' '.join(print_sexp(x) for x in exp) + ')' elif type(exp) == type() and re.search(r'[\s()]', exp): out += ' "%s"' % repr(exp)[1:-1].replace('"', '\"') else: out += ' %s' % exp return out
if __name__ == '__main__':
sexp = ( ( data "quoted data" 123 4.5) (data (123 (4.5) "(more" "data)")))
print('Input S-expression: %r' % (sexp, )) parsed = parse_sexp(sexp) print("\nParsed to Python:", parsed)
print("\nThen back to: '%s'" % print_sexp(parsed))</lang>
- Output
Input S-expression: ' ( ( data "quoted data" 123 4.5)\n (data (123 (4.5) "(more" "data)")))' Parsed to Python: [['data', 'quoted data', 123, 4.5], ['data', [123, [4.5], '(more', 'data)']]] Then back to: '(( data "quoted data" 123 4.5) ( data ( 123 ( 4.5) "(more" "data)")))'