if not modules then modules = { } end modules ['lxml-tab'] = {
version = 1.001,
comment = "this module is the basis for the lxml-* ones",
author = "Hans Hagen, PRAGMA-ADE, Hasselt NL",
copyright = "PRAGMA ADE / ConTeXt Development Team",
license = "see context related readme files"
}
-- this module needs a cleanup: check latest lpeg, passing args, (sub)grammar, etc etc
-- stripping spaces from e.g. cont-en.xml saves .2 sec runtime so it's not worth the
-- trouble
-- todo: when serializing optionally remap named entities to hex (if known in char-ent.lua)
-- maybe when letter -> utf, else name .. then we need an option to the serializer .. a bit
-- of work so we delay this till we cleanup
local trace_entities = false trackers.register("xml.entities", function(v) trace_entities = v end)
local report_xml = logs and logs.reporter("xml","core") or function(...) print(string.format(...)) end
--[[ldx--
The parser used here is inspired by the variant discussed in the lua book, but
handles comment and processing instructions, has a different structure, provides
parent access; a first version used different trickery but was less optimized to we
went this route. First we had a find based parser, now we have an based one.
The find based parser can be found in l-xml-edu.lua along with other older code.
Beware, the interface may change. For instance at, ns, tg, dt may get more
verbose names. Once the code is stable we will also remove some tracing and
optimize the code.
I might even decide to reimplement the parser using the latest trickery
as the current variant was written when showed up and it's easier now to
build tables in one go.
--ldx]]--
if lpeg.setmaxstack then lpeg.setmaxstack(1000) end -- deeply nested xml files
xml = xml or { }
local xml = xml
--~ local xml = xml
local concat, remove, insert = table.concat, table.remove, table.insert
local type, next, setmetatable, getmetatable, tonumber, rawset = type, next, setmetatable, getmetatable, tonumber, rawset
local lower, find, match, gsub = string.lower, string.find, string.match, string.gsub
local utfchar = utf.char
local lpegmatch, lpegpatterns = lpeg.match, lpeg.patterns
local P, S, R, C, V, C, Cs = lpeg.P, lpeg.S, lpeg.R, lpeg.C, lpeg.V, lpeg.C, lpeg.Cs
local formatters = string.formatters
--[[ldx--
First a hack to enable namespace resolving. A namespace is characterized by
a . The following function associates a namespace prefix with a
pattern. We use , which in this case is more than twice as fast as a
find based solution where we loop over an array of patterns. Less code and
much cleaner.
--ldx]]--
xml.xmlns = xml.xmlns or { }
local check = P(false)
local parse = check
--[[ldx--
The next function associates a namespace prefix with an . This
normally happens independent of parsing.
xml.registerns("mml","mathml")
--ldx]]--
function xml.registerns(namespace, pattern) -- pattern can be an lpeg
check = check + C(P(lower(pattern))) / namespace
parse = P { P(check) + 1 * V(1) }
end
--[[ldx--
The next function also registers a namespace, but this time we map a
given namespace prefix onto a registered one, using the given
. This used for attributes like xmlns:m.
xml.checkns("m","http://www.w3.org/mathml")
--ldx]]--
function xml.checkns(namespace,url)
local ns = lpegmatch(parse,lower(url))
if ns and namespace ~= ns then
xml.xmlns[namespace] = ns
end
end
--[[ldx--
Next we provide a way to turn an into a registered
namespace. This used for the xmlns attribute.
resolvedns = xml.resolvens("http://www.w3.org/mathml")
This returns mml.
--ldx]]--
function xml.resolvens(url)
return lpegmatch(parse,lower(url)) or ""
end
--[[ldx--
A namespace in an element can be remapped onto the registered
one efficiently by using the xml.xmlns table.
--ldx]]--
--[[ldx--
This version uses . We follow the same approach as before, stack and top and
such. This version is about twice as fast which is mostly due to the fact that
we don't have to prepare the stream for cdata, doctype etc etc. This variant is
is dedicated to Luigi Scarso, who challenged me with 40 megabyte files that
took 12.5 seconds to load (1.5 for file io and the rest for tree building). With
the implementation we got that down to less 7.3 seconds. Loading the 14
interface definition files (2.6 meg) went down from 1.05 seconds to 0.55.
Next comes the parser. The rather messy doctype definition comes in many
disguises so it is no surprice that later on have to dedicate quite some
code to it.
The code may look a bit complex but this is mostly due to the fact that we
resolve namespaces and attach metatables. There is only one public function:
local x = xml.convert(somestring)
An optional second boolean argument tells this function not to create a root
element.
Valid entities are:
--ldx]]--
-- not just one big nested table capture (lpeg overflow)
local nsremap, resolvens = xml.xmlns, xml.resolvens
local stack = { }
local top = { }
local dt = { }
local at = { }
local xmlns = { }
local errorstr = nil
local entities = { }
local strip = false
local cleanup = false
local utfize = false
local resolve_predefined = false
local unify_predefined = false
local dcache = { }
local hcache = { }
local acache = { }
local mt = { }
local function initialize_mt(root)
mt = { __index = root } -- will be redefined later
end
function xml.setproperty(root,k,v)
getmetatable(root).__index[k] = v
end
function xml.checkerror(top,toclose)
return "" -- can be set
end
local function add_attribute(namespace,tag,value)
if cleanup and #value > 0 then
value = cleanup(value) -- new
end
if tag == "xmlns" then
xmlns[#xmlns+1] = resolvens(value)
at[tag] = value
elseif namespace == "" then
at[tag] = value
elseif namespace == "xmlns" then
xml.checkns(tag,value)
at["xmlns:" .. tag] = value
else
-- for the moment this way:
at[namespace .. ":" .. tag] = value
end
end
local function add_empty(spacing, namespace, tag)
if #spacing > 0 then
dt[#dt+1] = spacing
end
local resolved = namespace == "" and xmlns[#xmlns] or nsremap[namespace] or namespace
top = stack[#stack]
dt = top.dt
local t = { ns=namespace or "", rn=resolved, tg=tag, at=at, dt={}, __p__ = top }
dt[#dt+1] = t
setmetatable(t, mt)
if at.xmlns then
remove(xmlns)
end
at = { }
end
local function add_begin(spacing, namespace, tag)
if #spacing > 0 then
dt[#dt+1] = spacing
end
local resolved = namespace == "" and xmlns[#xmlns] or nsremap[namespace] or namespace
top = { ns=namespace or "", rn=resolved, tg=tag, at=at, dt={}, __p__ = stack[#stack] }
setmetatable(top, mt)
dt = top.dt
stack[#stack+1] = top
at = { }
end
local function add_end(spacing, namespace, tag)
if #spacing > 0 then
dt[#dt+1] = spacing
end
local toclose = remove(stack)
top = stack[#stack]
if #stack < 1 then
errorstr = formatters["unable to close %s %s"](tag,xml.checkerror(top,toclose) or "")
report_xml(errorstr)
elseif toclose.tg ~= tag then -- no namespace check
errorstr = formatters["unable to close %s with %s %s"](toclose.tg,tag,xml.checkerror(top,toclose) or "")
report_xml(errorstr)
end
dt = top.dt
dt[#dt+1] = toclose
-- dt[0] = top -- nasty circular reference when serializing table
if toclose.at.xmlns then
remove(xmlns)
end
end
-- local function add_text(text)
-- if cleanup and #text > 0 then
-- dt[#dt+1] = cleanup(text)
-- else
-- dt[#dt+1] = text
-- end
-- end
local spaceonly = lpegpatterns.whitespace^0 * P(-1)
local function add_text(text)
local n = #dt
--
-- will be an option: dataonly
--
-- if #text == 0 or lpegmatch(spaceonly,text) then
-- return
-- end
--
if cleanup and #text > 0 then
if n > 0 then
local s = dt[n]
if type(s) == "string" then
dt[n] = s .. cleanup(text)
else
dt[n+1] = cleanup(text)
end
else
dt[1] = cleanup(text)
end
else
if n > 0 then
local s = dt[n]
if type(s) == "string" then
dt[n] = s .. text
else
dt[n+1] = text
end
else
dt[1] = text
end
end
end
local function add_special(what, spacing, text)
if #spacing > 0 then
dt[#dt+1] = spacing
end
if strip and (what == "@cm@" or what == "@dt@") then
-- forget it
else
dt[#dt+1] = { special=true, ns="", tg=what, dt={ text } }
end
end
local function set_message(txt)
errorstr = "garbage at the end of the file: " .. gsub(txt,"([ \n\r\t]*)","")
end
local reported_attribute_errors = { }
local function attribute_value_error(str)
if not reported_attribute_errors[str] then
report_xml("invalid attribute value %a",str)
reported_attribute_errors[str] = true
at._error_ = str
end
return str
end
local function attribute_specification_error(str)
if not reported_attribute_errors[str] then
report_xml("invalid attribute specification %a",str)
reported_attribute_errors[str] = true
at._error_ = str
end
return str
end
local badentity = "&error;"
local badentity = "&"
xml.placeholders = {
unknown_dec_entity = function(str) return str == "" and badentity or formatters["&%s;"](str) end,
unknown_hex_entity = function(str) return formatters["%s;"](str) end,
unknown_any_entity = function(str) return formatters["%s;"](str) end,
}
local placeholders = xml.placeholders
local function fromhex(s)
local n = tonumber(s,16)
if n then
return utfchar(n)
else
return formatters["h:%s"](s), true
end
end
local function fromdec(s)
local n = tonumber(s)
if n then
return utfchar(n)
else
return formatters["d:%s"](s), true
end
end
-- one level expansion (simple case), no checking done
local p_rest = (1-P(";"))^0
local p_many = P(1)^0
local p_char = lpegpatterns.utf8character
local parsedentity =
P("&") * (P("#x")*(p_rest/fromhex) + P("#")*(p_rest/fromdec)) * P(";") * P(-1) +
(P("#x")*(p_many/fromhex) + P("#")*(p_many/fromdec))
-- parsing in the xml file
local predefined_unified = {
[38] = "&",
[42] = """,
[47] = "'",
[74] = "<",
[76] = ">",
}
local predefined_simplified = {
[38] = "&", amp = "&",
[42] = '"', quot = '"',
[47] = "'", apos = "'",
[74] = "<", lt = "<",
[76] = ">", gt = ">",
}
local nofprivates = 0xF0000 -- shared but seldom used
local privates_u = { -- unescaped
[ [[&]] ] = "&",
[ [["]] ] = """,
[ [[']] ] = "'",
[ [[<]] ] = "<",
[ [[>]] ] = ">",
}
local privates_p = {
}
local privates_n = {
-- keeps track of defined ones
}
-- -- local escaped = utf.remapper(privates_u) -- can't be used as it freezes
-- -- local unprivatized = utf.remapper(privates_p) -- can't be used as it freezes
--
-- local p_privates_u = false
-- local p_privates_p = false
--
-- table.setmetatablenewindex(privates_u,function(t,k,v) rawset(t,k,v) p_privates_u = false end)
-- table.setmetatablenewindex(privates_p,function(t,k,v) rawset(t,k,v) p_privates_p = false end)
--
-- local function escaped(str)
-- if not str or str == "" then
-- return ""
-- else
-- if not p_privates_u then
-- p_privates_u = Cs((lpeg.utfchartabletopattern(privates_u)/privates_u + p_char)^0)
-- end
-- return lpegmatch(p_privates_u,str)
-- end
-- end
--
-- local function unprivatized(str)
-- if not str or str == "" then
-- return ""
-- else
-- if not p_privates_p then
-- p_privates_p = Cs((lpeg.utfchartabletopattern(privates_p)/privates_p + p_char)^0)
-- end
-- return lpegmatch(p_privates_p,str)
-- end
-- end
local escaped = utf.remapper(privates_u,"dynamic")
local unprivatized = utf.remapper(privates_p,"dynamic")
xml.unprivatized = unprivatized
local function unescaped(s)
local p = privates_n[s]
if not p then
nofprivates = nofprivates + 1
p = utfchar(nofprivates)
privates_n[s] = p
s = "&" .. s .. ";" -- todo: use char-ent to map to hex
privates_u[p] = s
privates_p[p] = s
end
return p
end
xml.privatetoken = unescaped
xml.privatecodes = privates_n
local function handle_hex_entity(str)
local h = hcache[str]
if not h then
local n = tonumber(str,16)
h = unify_predefined and predefined_unified[n]
if h then
if trace_entities then
report_xml("utfize, converting hex entity %s; into %a",str,h)
end
elseif utfize then
h = (n and utfchar(n)) or xml.unknown_hex_entity(str) or ""
if not n then
report_xml("utfize, ignoring hex entity %s;",str)
elseif trace_entities then
report_xml("utfize, converting hex entity %s; into %a",str,h)
end
else
if trace_entities then
report_xml("found entity %s;",str)
end
h = "" .. str .. ";"
end
hcache[str] = h
end
return h
end
local function handle_dec_entity(str)
local d = dcache[str]
if not d then
local n = tonumber(str)
d = unify_predefined and predefined_unified[n]
if d then
if trace_entities then
report_xml("utfize, converting dec entity %s; into %a",str,d)
end
elseif utfize then
d = (n and utfchar(n)) or placeholders.unknown_dec_entity(str) or ""
if not n then
report_xml("utfize, ignoring dec entity %s;",str)
elseif trace_entities then
report_xml("utfize, converting dec entity %s; into %a",str,d)
end
else
if trace_entities then
report_xml("found entity %s;",str)
end
d = "" .. str .. ";"
end
dcache[str] = d
end
return d
end
xml.parsedentitylpeg = parsedentity
local function handle_any_entity(str)
if resolve then
local a = acache[str] -- per instance ! todo
if not a then
a = resolve_predefined and predefined_simplified[str]
if a then
if trace_entities then
report_xml("resolving entity &%s; to predefined %a",str,a)
end
else
if type(resolve) == "function" then
a = resolve(str) or entities[str]
else
a = entities[str]
end
if a then
if type(a) == "function" then
if trace_entities then
report_xml("expanding entity &%s; to function call",str)
end
a = a(str) or ""
end
a = lpegmatch(parsedentity,a) or a -- for nested
if trace_entities then
report_xml("resolving entity &%s; to internal %a",str,a)
end
else
local unknown_any_entity = placeholders.unknown_any_entity
if unknown_any_entity then
a = unknown_any_entity(str) or ""
end
if a then
if trace_entities then
report_xml("resolving entity &%s; to external %s",str,a)
end
else
if trace_entities then
report_xml("keeping entity &%s;",str)
end
if str == "" then
a = badentity
else
a = "&" .. str .. ";"
end
end
end
end
acache[str] = a
elseif trace_entities then
if not acache[str] then
report_xml("converting entity &%s; to %a",str,a)
acache[str] = a
end
end
return a
else
local a = acache[str]
if not a then
a = resolve_predefined and predefined_simplified[str]
if a then
-- one of the predefined
acache[str] = a
if trace_entities then
report_xml("entity &%s; becomes %a",str,a)
end
elseif str == "" then
if trace_entities then
report_xml("invalid entity &%s;",str)
end
a = badentity
acache[str] = a
else
if trace_entities then
report_xml("entity &%s; is made private",str)
end
-- a = "&" .. str .. ";"
a = unescaped(str)
acache[str] = a
end
end
return a
end
end
-- local function handle_end_entity(chr)
-- report_xml("error in entity, %a found instead of %a",chr,";")
-- end
local function handle_end_entity(str)
report_xml("error in entity, %a found without ending %a",str,";")
return str
end
local function handle_crap_error(chr)
report_xml("error in parsing, unexpected %a found ",chr)
add_text(chr)
return chr
end
local space = S(' \r\n\t')
local open = P('<')
local close = P('>')
local squote = S("'")
local dquote = S('"')
local equal = P('=')
local slash = P('/')
local colon = P(':')
local semicolon = P(';')
local ampersand = P('&')
local valid = R('az', 'AZ', '09') + S('_-.')
local name_yes = C(valid^1) * colon * C(valid^1)
local name_nop = C(P(true)) * C(valid^1)
local name = name_yes + name_nop
local utfbom = lpegpatterns.utfbom -- no capture
local spacing = C(space^0)
----- entitycontent = (1-open-semicolon)^0
local anyentitycontent = (1-open-semicolon-space-close-ampersand)^0
local hexentitycontent = R("AF","af","09")^0
local decentitycontent = R("09")^0
local parsedentity = P("#")/"" * (
P("x")/"" * (hexentitycontent/handle_hex_entity) +
(decentitycontent/handle_dec_entity)
) + (anyentitycontent/handle_any_entity)
----- entity = ampersand/"" * parsedentity * ( (semicolon/"") + #(P(1)/handle_end_entity))
local entity = (ampersand/"") * parsedentity * (semicolon/"")
+ ampersand * (anyentitycontent / handle_end_entity)
local text_unparsed = C((1-open)^1)
local text_parsed = Cs(((1-open-ampersand)^1 + entity)^1)
local somespace = space^1
local optionalspace = space^0
----- value = (squote * C((1 - squote)^0) * squote) + (dquote * C((1 - dquote)^0) * dquote) -- ampersand and < also invalid in value
local value = (squote * Cs((entity + (1 - squote))^0) * squote) + (dquote * Cs((entity + (1 - dquote))^0) * dquote) -- ampersand and < also invalid in value
local endofattributes = slash * close + close -- recovery of flacky html
local whatever = space * name * optionalspace * equal
----- wrongvalue = C(P(1-whatever-close)^1 + P(1-close)^1) / attribute_value_error
----- wrongvalue = C(P(1-whatever-endofattributes)^1 + P(1-endofattributes)^1) / attribute_value_error
----- wrongvalue = C(P(1-space-endofattributes)^1) / attribute_value_error
local wrongvalue = Cs(P(entity + (1-space-endofattributes))^1) / attribute_value_error
local attributevalue = value + wrongvalue
local attribute = (somespace * name * optionalspace * equal * optionalspace * attributevalue) / add_attribute
----- attributes = (attribute)^0
local attributes = (attribute + somespace^-1 * (((1-endofattributes)^1)/attribute_specification_error))^0
local parsedtext = text_parsed / add_text
local unparsedtext = text_unparsed / add_text
local balanced = P { "[" * ((1 - S"[]") + V(1))^0 * "]" } -- taken from lpeg manual, () example
local emptyelement = (spacing * open * name * attributes * optionalspace * slash * close) / add_empty
local beginelement = (spacing * open * name * attributes * optionalspace * close) / add_begin
local endelement = (spacing * open * slash * name * optionalspace * close) / add_end
-- todo: combine the opens in:
local begincomment = open * P("!--")
local endcomment = P("--") * close
local begininstruction = open * P("?")
local endinstruction = P("?") * close
local begincdata = open * P("![CDATA[")
local endcdata = P("]]") * close
local someinstruction = C((1 - endinstruction)^0)
local somecomment = C((1 - endcomment )^0)
local somecdata = C((1 - endcdata )^0)
local function normalentity(k,v ) entities[k] = v end
local function systementity(k,v,n) entities[k] = v end
local function publicentity(k,v,n) entities[k] = v end
-- todo: separate dtd parser
local begindoctype = open * P("!DOCTYPE")
local enddoctype = close
local beginset = P("[")
local endset = P("]")
local doctypename = C((1-somespace-close)^0)
local elementdoctype = optionalspace * P(" &
cleanup = settings.text_cleanup
entities = settings.entities or { }
--
if utfize == nil then
settings.utfize_entities = true
utfize = true
end
if resolve_predefined == nil then
settings.resolve_predefined_entities = true
resolve_predefined = true
end
--
stack, top, at, xmlns, errorstr = { }, { }, { }, { }, nil
acache, hcache, dcache = { }, { }, { } -- not stored
reported_attribute_errors = { }
if settings.parent_root then
mt = getmetatable(settings.parent_root)
else
initialize_mt(top)
end
stack[#stack+1] = top
top.dt = { }
dt = top.dt
if not data or data == "" then
errorstr = "empty xml file"
elseif utfize or resolve then
if lpegmatch(grammar_parsed_text,data) then
-- errorstr = "" can be set!
else
errorstr = "invalid xml file - parsed text"
end
elseif type(data) == "string" then
if lpegmatch(grammar_unparsed_text,data) then
errorstr = ""
else
errorstr = "invalid xml file - unparsed text"
end
else
errorstr = "invalid xml file - no text at all"
end
local result
if errorstr and errorstr ~= "" then
result = { dt = { { ns = "", tg = "error", dt = { errorstr }, at={ }, er = true } } }
setmetatable(result, mt)
setmetatable(result.dt[1], mt)
setmetatable(stack, mt)
local errorhandler = settings.error_handler
if errorhandler == false then
-- no error message
else
errorhandler = errorhandler or xml.errorhandler
if errorhandler then
local currentresource = settings.currentresource
if currentresource and currentresource ~= "" then
xml.errorhandler(formatters["load error in [%s]: %s"](currentresource,errorstr))
else
xml.errorhandler(formatters["load error: %s"](errorstr))
end
end
end
else
result = stack[1]
end
if not settings.no_root then
result = { special = true, ns = "", tg = '@rt@', dt = result.dt, at={ }, entities = entities, settings = settings }
setmetatable(result, mt)
local rdt = result.dt
for k=1,#rdt do
local v = rdt[k]
if type(v) == "table" and not v.special then -- always table -)
result.ri = k -- rootindex
v.__p__ = result -- new, experiment, else we cannot go back to settings, we need to test this !
break
end
end
end
if errorstr and errorstr ~= "" then
result.error = true
else
errorstr = nil
end
result.statistics = {
errormessage = errorstr,
entities = {
decimals = dcache,
hexadecimals = hcache,
names = acache,
}
}
strip, utfize, resolve, resolve_predefined = nil, nil, nil, nil
unify_predefined, cleanup, entities = nil, nil, nil
stack, top, at, xmlns, errorstr = nil, nil, nil, nil, nil
acache, hcache, dcache = nil, nil, nil
reported_attribute_errors, mt, errorhandler = nil, nil, nil
return result
end
-- Because we can have a crash (stack issues) with faulty xml, we wrap this one
-- in a protector:
local function xmlconvert(data,settings)
local ok, result = pcall(function() return _xmlconvert_(data,settings) end)
if ok then
return result
else
return _xmlconvert_("",settings)
end
end
xml.convert = xmlconvert
function xml.inheritedconvert(data,xmldata) -- xmldata is parent
local settings = xmldata.settings
if settings then
settings.parent_root = xmldata -- to be tested
end
-- settings.no_root = true
local xc = xmlconvert(data,settings) -- hm, we might need to locate settings
-- xc.settings = nil
-- xc.entities = nil
-- xc.special = nil
-- xc.ri = nil
-- print(xc.tg)
return xc
end
--[[ldx--
Packaging data in an xml like table is done with the following
function. Maybe it will go away (when not used).
--ldx]]--
function xml.is_valid(root)
return root and root.dt and root.dt[1] and type(root.dt[1]) == "table" and not root.dt[1].er
end
function xml.package(tag,attributes,data)
local ns, tg = match(tag,"^(.-):?([^:]+)$")
local t = { ns = ns, tg = tg, dt = data or "", at = attributes or {} }
setmetatable(t, mt)
return t
end
function xml.is_valid(root)
return root and not root.error
end
xml.errorhandler = report_xml
--[[ldx--
We cannot load an from a filehandle so we need to load
the whole file first. The function accepts a string representing
a filename or a file handle.
--ldx]]--
function xml.load(filename,settings)
local data = ""
if type(filename) == "string" then
-- local data = io.loaddata(filename) - -todo: check type in io.loaddata
local f = io.open(filename,'r') -- why not 'rb'
if f then
data = f:read("*all") -- io.readall(f) ... only makes sense for large files
f:close()
end
elseif filename then -- filehandle
data = filename:read("*all") -- io.readall(f) ... only makes sense for large files
end
if settings then
settings.currentresource = filename
local result = xmlconvert(data,settings)
settings.currentresource = nil
return result
else
return xmlconvert(data,{ currentresource = filename })
end
end
--[[ldx--
When we inject new elements, we need to convert strings to
valid trees, which is what the next function does.
--ldx]]--
local no_root = { no_root = true }
function xml.toxml(data)
if type(data) == "string" then
local root = { xmlconvert(data,no_root) }
return (#root > 1 and root) or root[1]
else
return data
end
end
--[[ldx--
For copying a tree we use a dedicated function instead of the
generic table copier. Since we know what we're dealing with we
can speed up things a bit. The second argument is not to be used!
--ldx]]--
local function copy(old,tables)
if old then
tables = tables or { }
local new = { }
if not tables[old] then
tables[old] = new
end
for k,v in next, old do
new[k] = (type(v) == "table" and (tables[v] or copy(v, tables))) or v
end
local mt = getmetatable(old)
if mt then
setmetatable(new,mt)
end
return new
else
return { }
end
end
xml.copy = copy
--[[ldx--
In serializing the tree or parts of the tree is a major
actitivity which is why the following function is pretty optimized resulting
in a few more lines of code than needed. The variant that uses the formatting
function for all components is about 15% slower than the concatinating
alternative.
--ldx]]--
-- todo: add when not present
function xml.checkbom(root) -- can be made faster
if root.ri then
local dt = root.dt
for k=1,#dt do
local v = dt[k]
if type(v) == "table" and v.special and v.tg == "@pi@" and find(v.dt[1],"xml.*version=") then
return
end
end
insert(dt, 1, { special = true, ns = "", tg = "@pi@", dt = { "xml version='1.0' standalone='yes'" } } )
insert(dt, 2, "\n" )
end
end
--[[ldx--
At the cost of some 25% runtime overhead you can first convert the tree to a string
and then handle the lot.
--ldx]]--
-- new experimental reorganized serialize
local f_attribute = formatters['%s=%q']
local function verbose_element(e,handlers,escape) -- options
local handle = handlers.handle
local serialize = handlers.serialize
local ens, etg, eat, edt, ern = e.ns, e.tg, e.at, e.dt, e.rn
local ats = eat and next(eat) and { }
if ats then
local n = 0
for k,v in next, eat do
n = n + 1
ats[n] = f_attribute(k,escaped(v))
end
end
if ern and trace_entities and ern ~= ens then
ens = ern
end
if ens ~= "" then
if edt and #edt > 0 then
if ats then
handle("<",ens,":",etg," ",concat(ats," "),">")
else
handle("<",ens,":",etg,">")
end
for i=1,#edt do
local e = edt[i]
if type(e) == "string" then
handle(escaped(e))
else
serialize(e,handlers)
end
end
handle("",ens,":",etg,">")
else
if ats then
handle("<",ens,":",etg," ",concat(ats," "),"/>")
else
handle("<",ens,":",etg,"/>")
end
end
else
if edt and #edt > 0 then
if ats then
handle("<",etg," ",concat(ats," "),">")
else
handle("<",etg,">")
end
for i=1,#edt do
local e = edt[i]
if type(e) == "string" then
handle(escaped(e)) -- option: hexify escaped entities
else
serialize(e,handlers)
end
end
handle("",etg,">")
else
if ats then
handle("<",etg," ",concat(ats," "),"/>")
else
handle("<",etg,"/>")
end
end
end
end
local function verbose_pi(e,handlers)
handlers.handle("",e.dt[1],"?>")
end
local function verbose_comment(e,handlers)
handlers.handle("")
end
local function verbose_cdata(e,handlers)
handlers.handle("")
end
local function verbose_doctype(e,handlers)
handlers.handle("")
end
local function verbose_root(e,handlers)
handlers.serialize(e.dt,handlers)
end
local function verbose_text(e,handlers)
handlers.handle(escaped(e))
end
local function verbose_document(e,handlers)
local serialize = handlers.serialize
local functions = handlers.functions
for i=1,#e do
local ei = e[i]
if type(ei) == "string" then
functions["@tx@"](ei,handlers)
else
serialize(ei,handlers)
end
end
end
local function serialize(e,handlers,...)
if e then
local initialize = handlers.initialize
local finalize = handlers.finalize
local functions = handlers.functions
if initialize then
local state = initialize(...)
if not state == true then
return state
end
end
local etg = e.tg
if etg then
(functions[etg] or functions["@el@"])(e,handlers)
-- elseif type(e) == "string" then
-- functions["@tx@"](e,handlers)
else
functions["@dc@"](e,handlers) -- dc ?
end
if finalize then
return finalize()
end
end
end
local function xserialize(e,handlers)
local functions = handlers.functions
local etg = e.tg
if etg then
(functions[etg] or functions["@el@"])(e,handlers)
-- elseif type(e) == "string" then
-- functions["@tx@"](e,handlers)
else
functions["@dc@"](e,handlers)
end
end
local handlers = { }
local function newhandlers(settings)
local t = table.copy(handlers[settings and settings.parent or "verbose"] or { }) -- merge
if settings then
for k,v in next, settings do
if type(v) == "table" then
local tk = t[k] if not tk then tk = { } t[k] = tk end
for kk,vv in next, v do
tk[kk] = vv
end
else
t[k] = v
end
end
if settings.name then
handlers[settings.name] = t
end
end
utilities.storage.mark(t)
return t
end
local nofunction = function() end
function xml.sethandlersfunction(handler,name,fnc)
handler.functions[name] = fnc or nofunction
end
function xml.gethandlersfunction(handler,name)
return handler.functions[name]
end
function xml.gethandlers(name)
return handlers[name]
end
newhandlers {
name = "verbose",
initialize = false, -- faster than nil and mt lookup
finalize = false, -- faster than nil and mt lookup
serialize = xserialize,
handle = print,
functions = {
["@dc@"] = verbose_document,
["@dt@"] = verbose_doctype,
["@rt@"] = verbose_root,
["@el@"] = verbose_element,
["@pi@"] = verbose_pi,
["@cm@"] = verbose_comment,
["@cd@"] = verbose_cdata,
["@tx@"] = verbose_text,
}
}
--[[ldx--
How you deal with saving data depends on your preferences. For a 40 MB database
file the timing on a 2.3 Core Duo are as follows (time in seconds):
1.3 : load data from file to string
6.1 : convert string into tree
5.3 : saving in file using xmlsave
6.8 : converting to string using xml.tostring
3.6 : saving converted string in file
Beware, these were timing with the old routine but measurements will not be that
much different I guess.
--ldx]]--
-- maybe this will move to lxml-xml
local result
local xmlfilehandler = newhandlers {
name = "file",
initialize = function(name)
result = io.open(name,"wb")
return result
end,
finalize = function()
result:close()
return true
end,
handle = function(...)
result:write(...)
end,
}
-- no checking on writeability here but not faster either
--
-- local xmlfilehandler = newhandlers {
-- initialize = function(name)
-- io.output(name,"wb")
-- return true
-- end,
-- finalize = function()
-- io.close()
-- return true
-- end,
-- handle = io.write,
-- }
function xml.save(root,name)
serialize(root,xmlfilehandler,name)
end
local result
local xmlstringhandler = newhandlers {
name = "string",
initialize = function()
result = { }
return result
end,
finalize = function()
return concat(result)
end,
handle = function(...)
result[#result+1] = concat { ... }
end,
}
local function xmltostring(root) -- 25% overhead due to collecting
if not root then
return ""
elseif type(root) == "string" then
return root
else -- if next(root) then -- next is faster than type (and >0 test)
return serialize(root,xmlstringhandler) or ""
end
end
local function __tostring(root) -- inline
return (root and xmltostring(root)) or ""
end
initialize_mt = function(root) -- redefinition
mt = { __tostring = __tostring, __index = root }
end
xml.defaulthandlers = handlers
xml.newhandlers = newhandlers
xml.serialize = serialize
xml.tostring = xmltostring
--[[ldx--
The next function operated on the content only and needs a handle function
that accepts a string.
--ldx]]--
local function xmlstring(e,handle)
if not handle or (e.special and e.tg ~= "@rt@") then
-- nothing
elseif e.tg then
local edt = e.dt
if edt then
for i=1,#edt do
xmlstring(edt[i],handle)
end
end
else
handle(e)
end
end
xml.string = xmlstring
--[[ldx--
A few helpers:
--ldx]]--
--~ xmlsetproperty(root,"settings",settings)
function xml.settings(e)
while e do
local s = e.settings
if s then
return s
else
e = e.__p__
end
end
return nil
end
function xml.root(e)
local r = e
while e do
e = e.__p__
if e then
r = e
end
end
return r
end
function xml.parent(root)
return root.__p__
end
function xml.body(root)
return root.ri and root.dt[root.ri] or root -- not ok yet
end
function xml.name(root)
if not root then
return ""
end
local ns = root.ns
local tg = root.tg
if ns == "" then
return tg
else
return ns .. ":" .. tg
end
end
--[[ldx--
The next helper erases an element but keeps the table as it is,
and since empty strings are not serialized (effectively) it does
not harm. Copying the table would take more time. Usage:
--ldx]]--
function xml.erase(dt,k)
if dt then
if k then
dt[k] = ""
else for k=1,#dt do
dt[1] = { "" }
end end
end
end
--[[ldx--
The next helper assigns a tree (or string). Usage:
dt[k] = xml.assign(root) or xml.assign(dt,k,root)
--ldx]]--
function xml.assign(dt,k,root)
if dt and k then
dt[k] = type(root) == "table" and xml.body(root) or root
return dt[k]
else
return xml.body(root)
end
end
-- the following helpers may move
--[[ldx--
The next helper assigns a tree (or string). Usage:
xml.tocdata(e)
xml.tocdata(e,"error")
--ldx]]--
function xml.tocdata(e,wrapper) -- a few more in the aux module
local whatever = type(e) == "table" and xmltostring(e.dt) or e or ""
if wrapper then
whatever = formatters["<%s>%s%s>"](wrapper,whatever,wrapper)
end
local t = { special = true, ns = "", tg = "@cd@", at = { }, rn = "", dt = { whatever }, __p__ = e }
setmetatable(t,getmetatable(e))
e.dt = { t }
end
function xml.makestandalone(root)
if root.ri then
local dt = root.dt
for k=1,#dt do
local v = dt[k]
if type(v) == "table" and v.special and v.tg == "@pi@" then
local txt = v.dt[1]
if find(txt,"xml.*version=") then
v.dt[1] = txt .. " standalone='yes'"
break
end
end
end
end
return root
end
function xml.kind(e)
local dt = e and e.dt
if dt then
local n = #dt
if n == 1 then
local d = dt[1]
if d.special then
local tg = d.tg
if tg == "@cd@" then
return "cdata"
elseif tg == "@cm" then
return "comment"
elseif tg == "@pi@" then
return "instruction"
elseif tg == "@dt@" then
return "declaration"
end
elseif type(d) == "string" then
return "text"
end
return "element"
elseif n > 0 then
return "mixed"
end
end
return "empty"
end