1
0
Fork 0
fgdata/Nasal/xml.nas
mfranz d92ae39349 - skip DOCTYPE
- optional attr prefix in xml.dump
2007-05-11 15:44:16 +00:00

554 lines
12 KiB
Text

# XML parser for XML files that don't follow the FlightGear standard because they
# store information in attributes, like the crappy Traffic Manager and AI definition
# files. Currently only reading from a string is supported, and the XML 1.0 standard
# isn't fully implemented.
#
# Synopsis: xml.process_string(<xml-data:string>, <action:hash>);
# xml.process_file(<filepath>, <action:hash>);
#
# Examples:
# var n = xml.process_string("<foo>123</foo>", xml.tree, "__");
# var node = xml.process_file("foo/bar.xml", xml.tree, "__");
# if (node != nil)
# props.dump(node);
#
# The <action> interface (xml.tree) is a hash with function members begin(),
# end(), open(), close(), and data(). Its methods are called by the parser:
#
# begin(...)
# called once at the beginning; the method gets all arguments but the
# first two from the xml.process_string() call. In the above example it
# gets the "__" as arg[0].
#
# end()
# called once at the end; its return value is used as return value for
# xml.process_string()
#
# open(<tag:string>, <attr:hash>, <empty:bool>)
# called for every opening tag (empty=0) or self-closing, empty tag
# (empty=1). <tag> is the tag name, and <attr> is a hash with one
# name/value string pair per attribute.
#
# close(<tag:string>, <numchildren:int>)
# called for every closing tag, with tag name and number of child
# elements. From the latter it can be determined if the closed tag
# is a branch or a leaf node. The close() method is also called for
# self-closing tags, in which case <numchildren> is always 0.
#
# data(<string>)
# called for each data segment
#
# Example:
#
# <foo>123<bar this='is' a="test"/>456</foo>
#
# would cause these action interface calls:
#
# open("foo", {}, 0);
# data("123");
# open("bar", { this: "is", a: "test" }, 1);
# close("bar", 0);
# data("456");
# close("foo", 1);
#
#
# Predefined are two action hashes:
#
# xml.tree
#
# Synopsis: <node> = xml.process_string(<xml-string>, xml.tree, <attr-prefix:string>);
#
# Example: var node = xml.process_string("<foo>bar</foo>", xml.tree, "attr__");
#
# This parses the <xml-string> and returns it as props.Node property tree,
# which can then be processed with the known property methods, or copied
# to the main property tree: props.copy(node, props.globals.getNode("whatever", 1));
# Attributes are added as regular nodes, whereby the <attr-prefix> string is
# prepended to the attribute names. If collisions can be ruled out, then this
# prefix can be an empty string. If it's nil, then attributes are dropped
# altogether. FlightGear's standard attributes are *not* considered, as this
# parser is explicitly for non-standard XML sources. Standard files can be
# loaded more easily and quickly with fgfs means.
#
# xml.dump
#
# Example: xml.process_string("<foo>bar</foo>", xml.dump);
#
# This dumps the input xml data to the terminal while parsing. It's meant for
# debugging purposes.
#
#
#
# A minimal interface hash can look like this:
#
# var do_nothing = {
# begin : func {},
# end : func {},
# open : func {},
# close : func {},
# data : func {},
# };
#
# and would be used as: xml.process_string("<foo>bar</foo>", do_nothing);
# Character classification helpers. All of them take a character code as
# returned by the scanner's get() method. That method returns nil at the end
# of the input, so every helper has to answer "false" for nil instead of
# failing in a numeric comparison.

# True for blank, tab, newline and carriage return (equality tests only, so
# nil is already handled).
var isspace = func(c) { c == ` ` or c == `\t` or c == `\n` or c == `\r` }

# True for ASCII letters a-z and A-Z; false for nil.
var isletter = func(c) { c != nil and (c >= `a` and c <= `z` or c >= `A` and c <= `Z`) }

# True for ASCII digits 0-9; false for nil.
var isdigit = func(c) { c != nil and c >= `0` and c <= `9` }

# True for ASCII letters and digits.
var isalnum = func(c) { isdigit(c) or isletter(c) }

# True for characters that may start a tag or attribute name.
var istagfirst = func(c) { isletter(c) or c == `_` or c == `:` }

# True for characters that may follow the first character of a name.
var istagother = func(c) { isalnum(c) or c == `_` or c == `:` or c == `-` or c == `.` }
# Predefined entity names and the character codes they stand for.
var ctab = {
    "lt"   : `<`,
    "gt"   : `>`,
    "amp"  : `&`,
    "quot" : `"`,
    "apos" : `'`,
};

# Prefix of every error message raised by this module; process() uses it to
# tell parser errors apart from other runtime errors.
var error_label = "xml.nas: ";

# Aborts parsing with <msg>, prefixed by error_label and followed by the
# current position of the global scanner.
var error = func(msg) {
    die(error_label ~ msg ~ scan.location());
}
# SCANNER =========================================================================================
##
# Virtual base class: must be derived, adding get() and put().
#
# get() has to return the next character code, or nil at the end of the
# input; put() has to push characters back so that following get() calls
# return them again. All other methods are built on these two.
#
var Scanner = {
    # Creates the state shared by all scanners: the position counters used
    # by location(), and the source description that precedes them.
    new : func {
        var m = { parents : [Scanner] };
        m.line = 1;
        m.column = 0;
        m.source = " in";
        return m;
    },

    get : func die("get() method not implemented"),
    put : func die("put() method not implemented"),

    # Tries to consume the literal string <w>, optionally preceded by white
    # space (skipspaces != 0). Returns 1 if <w> was found and consumed.
    # Otherwise everything that was read (including skipped white space) is
    # pushed back and 0 is returned.
    skip : func(w, skipspaces = 1) {
        var consumed = [];          # characters read so far, in reading order
        var c = nil;
        if (skipspaces) {
            # use this scanner, not whatever the global one happens to be
            while (isspace(c = me.get()))
                append(consumed, c);
            me.put(c);
        }
        for (var i = 0; i < size(w); i += 1) {
            c = me.get();
            append(consumed, c);
            if (c != w[i]) {
                # push back last-read first, so that the first-read
                # character is the next one returned by get()
                for (var k = size(consumed) - 1; k >= 0; k -= 1)
                    me.put(consumed[k]);
                return 0;
            }
        }
        return 1;
    },

    # Reads a tag or attribute name. Returns the name as string, or nil (with
    # nothing consumed) if the next character can't start a name.
    getname : func {
        var s = "";
        var c = me.get();
        # nil means end of input; don't hand it to the character tests
        if (c == nil or !istagfirst(c)) {
            me.put(c);
            return nil;
        }
        s ~= chr(c);
        while (1) {
            c = me.get();
            if (c == nil or !istagother(c))
                break;
            s ~= chr(c);
        }
        me.put(c);
        return s;
    },

    # Reads the '= "value"' part of an attribute or declaration and returns
    # the value. Raises an error if the equal sign or the quoted string is
    # missing.
    getassign : func {
        me.skip_spaces();
        if (me.get() != `=`)
            error("equal sign expected in assignment");
        me.skip_spaces();
        var s = me.getstring();
        if (s == nil)
            error("quoted string expected in assignment");
        return s;
    },

    # Reads a string quoted with " or ', decoding entity references on the
    # way. Leading white space is skipped unless <spc> is 0. Returns nil
    # (with the character pushed back) if the next character isn't a quote,
    # and raises an error if the closing quote is missing.
    getstring : func(spc = 1) {
        spc and me.skip_spaces();
        var delim = me.get();
        if (delim != `"` and delim != `'`) {
            me.put(delim);
            return nil;
        }
        var s = "";
        while ((var c = me.get()) != nil and c != delim)
            s ~= chr(c == `&` ? me.special() : c);
        if (c != delim)
            error("string not closed with " ~ chr(delim));
        return s;
    },

    # Decodes an entity reference whose leading & has already been consumed.
    # Supports decimal character references (&#65;) and the names listed in
    # ctab. Returns the character code.
    special : func {
        var s = "";
        var c = me.get();
        var n = nil;
        if (c == `#`) {
            while ((c = me.get()) != nil and isdigit(c))
                s ~= chr(c);
            n = num(s);
        } else {
            me.put(c);
            while ((c = me.get()) != nil and c != `;`)
                s ~= chr(c);
        }
        if (c != `;`)
            error("entity reference not closed with ;");
        if (n != nil)
            return n;
        if (!contains(ctab, s))
            error("unknown entity reference");
        return ctab[s];
    },

    # Consumes white space and returns the number of skipped characters.
    skip_spaces : func {
        var n = 0;
        while (isspace(var c = me.get()))
            n += 1;
        me.put(c);
        return n;
    },

    # Updates the line/column counters for character <c>. StringScanner.get()
    # calls this for every character that it reads from its string.
    setmark : func(c) {
        if (c == `\n`) {
            me.line += 1;
            me.column = 0;
        } else {
            me.column += 1;
        }
    },

    # Returns the current position as text for use in error messages.
    location : func {
        return me.source ~ " line " ~ me.line ~ ", column " ~ me.column;
    },

    # Debugging aid: consumes the rest of the input and raises an error that
    # contains it.
    dump : func {
        var s = "";
        while ((var c = me.get()) != nil)
            s ~= chr(c);
        error("REST={" ~ s ~ "}");
    },
};
##
# Scanner implementation that takes its characters from a string. Characters
# that are pushed back with put() are kept on a stack and handed out again
# before reading continues in the string.
#
var StringScanner = {
    # Creates a scanner for string <s>.
    new : func(s) {
        var obj = Scanner.new();
        obj.stack = [];
        obj.pos = 0;
        obj.string = s;
        obj.parents = [StringScanner] ~ obj.parents;
        return obj;
    },

    # Returns the next character code: the most recently pushed back one if
    # there is any, otherwise the next one from the string, or nil at the
    # end of the string.
    get : func {
        if (size(me.stack) > 0)
            return pop(me.stack);
        if (me.pos < size(me.string)) {
            var ch = me.string[me.pos];
            me.pos += 1;
            me.setmark(ch);     # position only advances for fresh characters
            return ch;
        }
        return nil;
    },

    # Pushes back all arguments in the given order, so the last argument is
    # the first one returned by get().
    put : func {
        for (var i = 0; i < size(arg); i += 1)
            append(me.stack, arg[i]);
    },
};
# PARSER ==========================================================================================
# Parses a whole document from the global scanner and reports it to the
# global action hash. The arguments are passed on to action.begin(). Returns
# the return value of action.end(). Raises an error if the document contains
# no root element, or anything but comments, processing instructions and
# white space after it.
var parse_document = func(arg...) {
    call(action.begin, arg, action);
    parse_prolog();
    if (!parse_element()) {
        var c = scan.get();
        if (c == nil)
            error("document doesn't contain any data");
        scan.put(c);        # hand the offending character back to the scanner
        error("garbage");
    }
    parse_misc();
    scan.skip_spaces();
    if (scan.get() != nil)
        error("trailing garbage");
    return action.end();
}
# Parses everything that may precede the root element: the XML declaration
# and the document type declaration, each of which can be followed by
# comments and processing instructions.
var parse_prolog = func {
    foreach (var step; [parse_xmldecl, parse_misc, parse_doctype, parse_misc])
        step();
}
# Parses the XML declaration (<?xml version="..." ...?>) if there is one.
# The version is mandatory; encoding and standalone are optional and have to
# come in this order. All values are read and thrown away, only the
# standalone value is checked.
var parse_xmldecl = func {
    if (!scan.skip("<?"))
        return;

    # "xml" has to be followed by at least one white space character
    if (!(scan.skip("xml") and scan.skip_spaces()))
        error("prolog with invalid identifier. xml: expected");

    scan.skip("version") or error("prolog without version statement");
    scan.getassign();                   # version number; not used

    if (scan.skip("encoding"))
        scan.getassign();               # encoding name; not used

    if (scan.skip("standalone")) {
        var value = scan.getassign();
        if (!(value == "yes" or value == "no"))
            error("standalone value must be 'yes' or 'no'");
    }

    scan.skip("?>") or error("prolog not closed with ?>");
}
# Consumes any sequence of comments and processing instructions.
var parse_misc = func {
    while (1) {
        if (parse_comment())
            continue;
        if (parse_pi())
            continue;
        break;
    }
}
# Consumes a comment (<!-- ... -->). Returns 1 if there was one, and 0 (with
# nothing consumed) otherwise. Raises an error if the comment contains "--",
# or if the input ends before the comment is closed.
var parse_comment = func {
    if (!scan.skip("<!--"))
        return 0;
    while (1) {
        if (scan.skip("-->"))
            return 1;
        if (scan.skip("--"))
            error("illegal use of -- in comment");
        # get() keeps returning nil at the end of the input, so without this
        # test an unclosed comment would loop forever
        if (scan.get() == nil)
            break;
    }
    error("unfinished comment");
}
# Consumes a processing instruction (<? ... ?>). Returns 1 if there was one,
# and 0 (with nothing consumed) otherwise. Raises an error if the input ends
# before the instruction is closed.
var parse_pi = func {
    if (!scan.skip("<?"))
        return 0;
    while (1) {
        if (scan.skip("?>"))
            return 1;
        # get() keeps returning nil at the end of the input, so without this
        # test an unclosed instruction would loop forever
        if (scan.get() == nil)
            break;
    }
    error("unfinished 'processing instruction'");
}
# Skips a declaration that starts with "<!" and ends with ">", such as the
# document type declaration. Declarations nested in it are skipped
# recursively. Returns 1 if there was a declaration, and 0 (with nothing
# consumed) otherwise. Raises an error if the input ends before the
# declaration is closed.
var parse_doctype = func {
    if (!scan.skip("<!"))
        return 0;
    while (1) {
        parse_doctype();
        if (scan.skip(">"))
            return 1;
        # get() keeps returning nil at the end of the input, so without this
        # test an unclosed declaration would loop forever
        if (scan.get() == nil)
            break;
    }
    error("unfinished doctype");
}
# Reads character data up to the next "<" and returns it as string, with
# entity references decoded. Returns nil (with nothing consumed) if the next
# character already is a "<". This is only called for element content, so
# reaching the end of the input here means that an element wasn't closed,
# which is reported as error.
var parse_rawdata = func {
    var c = scan.get();
    if (c == nil)
        error("unexpected end of data");
    # push the character back in any case, so that the loop below treats the
    # first character like all others (it may be the & of an entity)
    scan.put(c);
    if (c == `<`)
        return nil;
    var s = "";
    while ((c = scan.get()) != `<` and c != nil)
        s ~= chr(c == `&` ? scan.special() : c);
    scan.put(c);
    return s;
}
# Reads a CDATA section (<![CDATA[ ... ]]>) and returns its contents as
# string. Returns nil (with nothing consumed) if there is no CDATA section,
# and raises an error if the input ends before the section is closed.
# The contents are returned literally: "]]>" is the only markup that is
# recognized in a CDATA section, so & doesn't start an entity reference
# there, and white space is part of the data.
var parse_cdsect = func {
    if (!scan.skip("<![CDATA["))
        return nil;
    var s = "";
    while (1) {
        # second argument 0: don't swallow white space in front of "]]>"
        if (scan.skip("]]>", 0))
            return s;
        var c = scan.get();
        if (c == nil)
            break;
        s ~= chr(c);
    }
    error("unfinished CDATA section");
}
# Parses one element including all of its contents, and reports it to the
# action hash: open() is called by parse_opening_tag(), data() for every
# CDATA section and text segment, and close() with the tag name and the
# number of child elements. Returns 1 if an element was parsed, and 0 if
# there is no opening tag at the current position. Raises an error if the
# closing tag doesn't match the opening tag, or if the contents contain
# markup that none of the parsers understands.
var parse_element = func {
    var open = parse_opening_tag();
    if (open == nil)
        return 0;
    if (open[2]) {
        action.close(open[0], 0);
        return 1;                       # tag was self-closing
    }
    var children = 0;
    var close = nil;
    while ((close = parse_closing_tag()) == nil) {
        # comments and processing instructions are consumed and dropped
        var progress = parse_comment() or parse_pi();
        var d = parse_cdsect();
        if (d != nil) {
            action.data(d);
            progress = 1;
        }
        d = parse_rawdata();
        if (d != nil) {
            action.data(d);
            progress = 1;
        }
        if (parse_element()) {
            children += 1;
            progress = 1;
        }
        # nothing was consumed, so the next round would see exactly the same
        # input again: give up rather than loop forever
        if (!progress)
            error("illegal markup in element <" ~ open[0] ~ ">");
    }
    if (open[0] != close)
        error("<" ~ open[0] ~ "> closed with <" ~ close ~ ">");
    action.close(close, children);
    return 1;
}
# Parses an opening or self-closing tag with its attributes and reports it
# with action.open(). Returns the vector [<name>, <attr-hash>, <selfclosing>],
# or nil (with nothing consumed) if there is no such tag at the current
# position. Raises an error if the tag is malformed.
var parse_opening_tag = func {
    if (!scan.skip("<"))
        return nil;
    var c = scan.get();
    # nil means end of input; don't hand it to the character test
    if (c == nil or !istagfirst(c)) {
        # not a tag name, so this is a closing tag, comment or the like;
        # put() pushes in argument order, so "<" is returned first again
        scan.put(c, `<`);
        return nil;
    }
    scan.put(c);
    var name = scan.getname();          # can't be nil
    var attr = {};
    while (1) {
        scan.skip_spaces();
        var n = scan.getname();
        if (n == nil)
            break;
        var v = scan.getassign();
        attr[n] = v;
    }
    # declared locally, so that no symbol of the same name in an outer scope
    # can be overwritten
    var selfclosing = 0;
    if (scan.skip("/>"))
        selfclosing = 1;
    elsif (scan.skip(">"))
        selfclosing = 0;
    else
        error("garbage in opening tag");
    action.open(name, attr, selfclosing);
    return [name, attr, selfclosing];
}
# Parses a closing tag (</name>) and returns its name. Returns nil (with
# nothing consumed) if the input doesn't continue with "</", and raises an
# error if the name or the final ">" is missing.
var parse_closing_tag = func {
    if (scan.skip("</")) {
        var tag = scan.getname();
        if (tag == nil)
            error("closing tag without name");
        scan.skip(">") or error("closing tag not ended with >");
        return tag;
    }
    return nil;
}
# ACTION HASHES ===================================================================================
# Action hash that builds a props.Node tree from the parsed document.
var tree = {
    # Starts with an empty root node. <prefix> is prepended to attribute
    # names; attributes are dropped if it is nil.
    begin : func(prefix) {
        me.node = props.Node.new();
        me.stack = [];
        me.prefix = prefix;
    },

    # Returns the node that is current at the end of parsing.
    end : func {
        me.node
    },

    # Adds a new child <name> to the current node and makes it the current
    # node. Attributes become children of the new node.
    open : func(name, attr) {
        # the next free index is the number of children with the same name
        var siblings = me.node.getChildren(name);
        me.node = me.node.getChild(name, size(siblings), 1);
        append(me.stack, "");           # text buffer for this element
        if (me.prefix == nil)
            return;
        foreach (var key; keys(attr)) {
            var target = me.node.getNode(me.prefix ~ key, 1);
            target.setValue(attr[key]);
        }
    },

    # Stores the collected text as node value if the element has no child
    # elements, and makes the parent the current node again.
    close : func(name, children) {
        var text = pop(me.stack);
        if (size(text) and !children)
            me.node.setValue(text);
        me.node = me.node.getParent();
    },

    # Appends <d> to the text buffer of the innermost open element.
    data : func(d) {
        var top = size(me.stack) - 1;
        me.stack[top] = me.stack[top] ~ d;
    },
};
# Action hash that prints the document structure to the terminal while it is
# parsed. Every line is indented with one tab per nesting level.
var dump = {
    # <prefix> is prepended to attribute names in the output.
    begin : func(prefix = "__") {
        me.prefix = prefix;
        me.level = 0;
    },

    end : func {
    },

    # Prints the tag, and below it one line per attribute in the form
    # <prefix+name>value</prefix+name>, sorted by attribute name.
    open : func(name, attr) {
        me.print("<", name, ">");
        me.level += 1;
        foreach (var n; sort(keys(attr), cmp))
            me.print("<", me.prefix, n, ">", attr[n], "</", me.prefix, n, ">");
    },

    close : func(name) {
        me.level -= 1;
        me.print("</", name, ">");
    },

    # Prints the data in single quotes, unless it consists of white space
    # only.
    data : func(data) {
        for (var i = 0; i < size(data); i += 1)
            if (!isspace(data[i]))
                return me.print("'", data, "'");
    },

    # Prints all arguments on one line behind the indentation for the
    # current level.
    print : func {
        var s = "";
        for (var i = 0; i < me.level; i += 1)
            s ~= "\t";
        call(print, [s] ~ arg);
    },
};
# Runs parse_document() with the given arguments and returns its result.
# Errors raised by this module (recognizable by the error_label prefix) are
# printed and turned into a nil return value; all other errors are raised
# again.
var process = func(arg...) {
    var failures = [];
    var result = call(parse_document, arg, failures);
    if (size(failures) == 0)
        return result;

    var message = failures[0];
    if (substr(message, 0, size(error_label)) == error_label) {
        print(message);
        return nil;
    }
    die(message);                       # rethrow
}
# Scanner that delivers the characters of the document that is being parsed;
# set by process_string() and process_file(), and used by all parse_*()
# functions and by error().
var scan = nil;
# Action hash that receives the begin/end/open/close/data calls; set by
# process_string() and process_file().
var action = nil;
# INTERFACE =======================================================================================
# Parses the XML data in <string> and reports it to the action hash <act>.
# Further arguments are handed to the hash's begin() method. Returns the
# return value of the hash's end() method, or nil if the data could not be
# parsed.
var process_string = func(string, act, arg...) {
    action = act;
    scan = StringScanner.new(string);
    return call(process, arg);
}
# Like process_string(), but reads the XML data from <file>. The file name
# is added to the position information of error messages.
var process_file = func(file, act, arg...) {
    var contents = io.readfile(file);
    action = act;
    scan = StringScanner.new(contents);
    scan.source = "\n in file " ~ file ~ ",";
    return call(process, arg);
}