"That's easy, I can use regular expressions!"
No, you can't.
"This is tree data, I'll take the DOM!"
"This is tree data, I'll take the DOM!"
=> write verbose, redundant, hard-to-maintain code
"SAX is so fast and consumes no memory!"
"SAX is so fast and consumes no memory!"
=> write confusing state-machine code
=> debugging into existence
Getting XML work done
(instead of getting time wasted)
... then »lxml« is your friend!
why »lxml takes all the pain out of XML«
(a quick overview of lxml features and ElementTree concepts)
uses Clark notation:
>>> tag = "{http://www.w3.org/the/namespace}tagname"
>>> element = etree.Element(tag)
no prefixes!
a single, self-contained tag identifier
uses .text and .tail attributes:
>>> div = html.fragment_fromstring(
... "<div><p>a paragraph<br>split in two</p> parts</div>")
>>> p = div[0]
>>> br = p[0]
>>> p.text
'a paragraph'
>>> br.text
>>> br.tail
'split in two'
>>> p.tail
' parts'
no text nodes!
uses .get() and .set() methods:
>>> root = etree.fromstring(
... '<root a="the value" b="of an" c="attribute"/>')
>>> root.get('a')
'the value'
>>> root.set('a', "THE value")
>>> root.get('a')
'THE value'
or the .attrib dictionary property:
>>> d = root.attrib
>>> list(sorted(d.keys()))
['a', 'b', 'c']
>>> list(sorted(d.values()))
['THE value', 'attribute', 'of an']
>>> root = etree.fromstring(
... "<root> <a><b/><b/></a> <c><d/><e><f/></e><g/></c> </root>")
>>> print([child.tag for child in root]) # children
['a', 'c']
>>> print([el.tag for el in root.iter()]) # self and descendants
['root', 'a', 'b', 'b', 'c', 'd', 'e', 'f', 'g']
>>> print([el.tag for el in root.iterdescendants()])
['a', 'b', 'b', 'c', 'd', 'e', 'f', 'g']
>>> def iter_breadth_first(root):
...     bfs_queue = collections.deque([root])
...     while bfs_queue:
...         el = bfs_queue.popleft()  # pop next element
...         bfs_queue.extend(el)      # append its children
...         yield el
>>> print([el.tag for el in iter_breadth_first(root)])
['root', 'a', 'c', 'b', 'b', 'd', 'e', 'g', 'f']
>>> root = etree.fromstring(
... "<root> <a><b/><b/></a> <c><d/><e><f/></e><g/></c> </root>")
>>> tree_walker = etree.iterwalk(root, events=('start', 'end'))
>>> for (event, element) in tree_walker:
...     print("%s (%s)" % (element.tag, event))
root (start)
a (start)
b (start)
b (end)
b (start)
b (end)
a (end)
c (start)
d (start)
d (end)
e (start)
f (start)
f (end)
e (end)
g (start)
g (end)
c (end)
root (end)
<root>
<speech class='dialog'><p>So be it!</p></speech>
<p>stuff</p>
</root>
search it with XPath
>>> find_paragraphs = etree.XPath("//p")
>>> paragraphs = find_paragraphs(xml_tree)
>>> print([ p.text for p in paragraphs ])
['So be it!', 'stuff']
search it with CSS selectors
>>> find_dialogs = cssselect.CSSSelector("speech.dialog p")
>>> paragraphs = find_dialogs(xml_tree)
>>> print([ p.text for p in paragraphs ])
['So be it!']
The input side
(a quick overview)
(parsing from strings and filenames/URLs frees the GIL)
using the fromstring() function:
>>> root_element = etree.fromstring(some_xml_data)
using the fromstring() function with a specific parser:
>>> parser = etree.HTMLParser(remove_comments=True)
>>> root_element = etree.fromstring(some_html_data, parser)
or the XML() and HTML() aliases for literals in code:
>>> root_element = etree.XML("<root><child/></root>")
>>> root_element = etree.HTML("<p>some<br>paragraph</p>")
The output side
(and how to make it safe and simple)
The Atom XML format
The ElementMaker (or E-factory)
>>> from lxml.builder import ElementMaker
>>> A = ElementMaker(namespace="http://www.w3.org/2005/Atom",
... nsmap={None : "http://www.w3.org/2005/Atom"})
>>> atom = A.feed(
... A.author( A.name("Stefan Behnel") ),
... A.entry(
... A.title("News from lxml"),
... A.link(href="http://codespeak.net/lxml/"),
... A.summary("See what's <b>fun</b> about lxml...",
... type="html"),
... )
... )
>>> from lxml.etree import tostring
>>> print( tostring(atom, pretty_print=True) )
>>> atom = A.feed(
... A.author( A.name("Stefan Behnel") ),
... A.entry(
... A.title("News from lxml"),
... A.link(href="http://codespeak.net/lxml/"),
... A.summary("See what's <b>fun</b> about lxml...",
... type="html"),
... )
... )
<feed xmlns="http://www.w3.org/2005/Atom">
<author>
<name>Stefan Behnel</name>
</author>
<entry>
<title>News from lxml</title>
<link href="http://codespeak.net/lxml/"/>
<summary type="html">See what's &lt;b&gt;fun&lt;/b&gt;
about lxml...</summary>
</entry>
</feed>
>>> atom = A.feed(
... A.author( A.name("Stefan Behnel") ),
... A.entry(
... A.titel("News from lxml"),
... A.link(href="http://codespeak.net/lxml/"),
... A.summary("See what's <b>fun</b> about lxml...",
... type="html"),
... )
... )
<feed xmlns="http://www.w3.org/2005/Atom">
<author>
<name>Stefan Behnel</name>
</author>
<entry>
<titel>News from lxml</titel>
<link href="http://codespeak.net/lxml/"/>
<summary type="html">See what's &lt;b&gt;fun&lt;/b&gt;
about lxml...</summary>
</entry>
</feed>
Write an XML generator module instead:
# atomgen.py
from lxml import etree
from lxml.builder import ElementMaker
ATOM_NAMESPACE = "http://www.w3.org/2005/Atom"

# Element factory for Atom: everything it builds lives in the Atom
# namespace, which is also declared as the default (prefix-less) namespace.
A = ElementMaker(nsmap={None: ATOM_NAMESPACE}, namespace=ATOM_NAMESPACE)

# Export one factory per known Atom tag, so that a misspelled tag name fails
# with an AttributeError on the module instead of building a bogus element.
feed, entry, title = A.feed, A.entry, A.title
# ... and so on and so forth ...

# plus a little validation function: isvalid()
isvalid = etree.RelaxNG(file="atom.rng")
>>> import atomgen as A
>>> atom = A.feed(
... A.author( A.name("Stefan Behnel") ),
... A.entry(
... A.link(href="http://codespeak.net/lxml/"),
... A.title("News from lxml"),
... A.summary("See what's <b>fun</b> about lxml...",
... type="html"),
... )
... )
>>> A.isvalid(atom) # ok, forgot the ID's => invalid XML ...
False
>>> title = A.titel("News from lxml")
Traceback (most recent call last):
...
AttributeError: 'module' object has no attribute 'titel'
Atom can embed serialised HTML
>>> import lxml.html.builder as h
>>> html_fragment = h.DIV(
... "this is some\n",
... h.A("HTML", href="http://w3.org/MarkUp/"),
... "\ncontent")
>>> serialised_html = etree.tostring(html_fragment, method="html")
>>> summary = A.summary(serialised_html, type="html")
>>> print(etree.tostring(summary))
<summary xmlns="http://www.w3.org/2005/Atom" type="html">
&lt;div&gt;this is some
&lt;a href="http://w3.org/MarkUp/"&gt;HTML&lt;/a&gt;
content&lt;/div&gt;
</summary>
Atom can also embed non-escaped XHTML
>>> from copy import deepcopy
>>> xhtml_fragment = deepcopy(html_fragment)
>>> from lxml.html import html_to_xhtml
>>> html_to_xhtml(xhtml_fragment)
>>> summary = A.summary(xhtml_fragment, type="xhtml")
>>> print(etree.tostring(summary, pretty_print=True))
<summary xmlns="http://www.w3.org/2005/Atom" type="xhtml">
<html:div xmlns:html="http://www.w3.org/1999/xhtml">this is some
<html:a href="http://w3.org/MarkUp/">HTML</html:a>
content</html:div>
</summary>
... this is all you need for the output side of XML languages
The Element API
(and how to make it the way you want)
define a subclass of ElementBase
>>> class HonkElement(etree.ElementBase):
...     @property
...     def honking(self):
...         return self.get('honking') == 'true'
let it replace the default Element class
>>> lookup = etree.ElementDefaultClassLookup(
... element=HonkElement)
>>> parser = etree.XMLParser()
>>> parser.set_element_class_lookup(lookup)
use the new Element class
>>> root = etree.XML('<root><honk honking="true"/></root>',
... parser)
>>> root.honking
False
>>> root[0].honking
True
a feed is a container for entries
# atom.py
ATOM_NAMESPACE = "http://www.w3.org/2005/Atom"
_ATOM_NS = "{%s}" % ATOM_NAMESPACE  # Clark-notation prefix for Atom tags


class FeedElement(etree.ElementBase):
    """Element class for an Atom feed, exposing its entries."""

    @property
    def entries(self):
        """List of the ``entry`` children that are in the Atom namespace."""
        entry_tag = _ATOM_NS + "entry"
        return self.findall(entry_tag)
it also has a couple of meta-data children, e.g. title
class FeedElement(etree.ElementBase):
    """Element class for an Atom feed."""
    # ...
    @property
    def title(self):
        """Return the feed's Atom ``title`` child element, or None if absent."""
        # Atom children live in the Atom namespace, so the tag has to be
        # namespace-qualified; a bare "title" only matches un-namespaced
        # elements and would never find the real title.
        return self.find(_ATOM_NS + "title")
>>> from lxml import objectify
>>> feed = objectify.parse("atom-example.xml").getroot()
>>> print(feed.title)
Example Feed
>>> print([entry.title for entry in feed.entry])
['Atom-Powered Robots Run Amok']
>>> print(feed.entry[0].title)
Atom-Powered Robots Run Amok
from itertools import chain
class FeedElement(objectify.ObjectifiedElement):
    """Objectify-based element class for an Atom feed."""

    def addIDs(self):
        """Initialise the IDs of the feed and its entries.

        The feed itself and each of its Atom ``entry`` children is checked
        for an Atom ``id`` child; where none exists, a new one is appended
        with the text returned by ``make_guid()``.
        """
        id_tag = _ATOM_NS + "id"
        # iterchildren() copes with a feed that has no entries, whereas the
        # objectify attribute lookup ``self.entry`` raises AttributeError.
        entries = self.iterchildren(_ATOM_NS + "entry")
        for element in chain([self], entries):
            if element.find(id_tag) is not None:
                continue
            # objectify elements do not allow assigning ``.text``, so the id
            # is built as a free-standing element first and then attached to
            # the element that lacks one (not unconditionally to the feed).
            guid = etree.Element(id_tag)
            guid.text = make_guid()
            element.append(guid)
Atom has a namespace => leave the mapping to lxml
# ...
# Atom-specific element classes are resolved by namespace; every other
# element falls back to objectify's default class lookup.
_atom_lookup = etree.ElementNamespaceClassLookup(
    objectify.ObjectifyElementClassLookup()
)

# map the classes to tag names
ns = _atom_lookup.get_namespace(ATOM_NAMESPACE)
ns["feed"] = FeedElement
ns["entry"] = EntryElement
# ... and so on
# or use ns.update(vars()) with appropriate class names

# create a parser that does some whitespace cleanup
# and make it use our Atom classes
atom_parser = etree.XMLParser(remove_blank_text=True)
atom_parser.set_element_class_lookup(_atom_lookup)


# and help users in using our parser setup
def parse(input):
    """Parse *input* with the Atom-aware parser and return the parsed tree."""
    return etree.parse(input, parser=atom_parser)
>>> import atom
>>> feed = atom.parse("ep2008/atom-example.xml").getroot()
>>> print(len(feed.entry))
1
>>> print([entry.title for entry in feed.entry])
['Atom-Powered Robots Run Amok']
>>> link_tag = "{%s}link" % atom.ATOM_NAMESPACE
>>> print([link.get("href") for link in feed.iter(link_tag)])
['http://example.org/', 'http://example.org/2003/12/13/atom03']
To implement an XML API ...
lxml ...