# # ElementTree # $Id: HTMLTreeBuilder.py 2325 2005-03-16 15:50:43Z fredrik $ # # a simple tree builder, for HTML input # # history: # 2002-04-06 fl created # 2002-04-07 fl ignore IMG and HR end tags # 2002-04-07 fl added support for 1.5.2 and later # 2003-04-13 fl added HTMLTreeBuilder alias # 2004-12-02 fl don't feed non-ASCII charrefs/entities as 8-bit strings # 2004-12-05 fl don't feed non-ASCII CDATA as 8-bit strings # # Copyright (c) 1999-2004 by Fredrik Lundh. All rights reserved. # # fredrik@pythonware.com # http://www.pythonware.com # # -------------------------------------------------------------------- # The ElementTree toolkit is # # Copyright (c) 1999-2004 by Fredrik Lundh # # By obtaining, using, and/or copying this software and/or its # associated documentation, you agree that you have read, understood, # and will comply with the following terms and conditions: # # Permission to use, copy, modify, and distribute this software and # its associated documentation for any purpose and without fee is # hereby granted, provided that the above copyright notice appears in # all copies, and that both that copyright notice and this permission # notice appear in supporting documentation, and that the name of # Secret Labs AB or the author not be used in advertising or publicity # pertaining to distribution of the software without specific, written # prior permission. # # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT- # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE # OF THIS SOFTWARE. # -------------------------------------------------------------------- ## # Tools to build element trees from HTML files. ## import htmlentitydefs import re, string, sys import mimetools, StringIO import ElementTree AUTOCLOSE = "p", "li", "tr", "th", "td", "head", "body" IGNOREEND = "img", "hr", "meta", "link", "br" if sys.version[:3] == "1.5": is_not_ascii = re.compile(r"[\x80-\xff]").search # 1.5.2 else: is_not_ascii = re.compile(eval(r'u"[\u0080-\uffff]"')).search try: from HTMLParser import HTMLParser except ImportError: from sgmllib import SGMLParser # hack to use sgmllib's SGMLParser to emulate 2.2's HTMLParser class HTMLParser(SGMLParser): # the following only works as long as this class doesn't # provide any do, start, or end handlers def unknown_starttag(self, tag, attrs): self.handle_starttag(tag, attrs) def unknown_endtag(self, tag): self.handle_endtag(tag) ## # ElementTree builder for HTML source code. This builder converts an # HTML document or fragment to an ElementTree. #

# The parser is relatively picky, and requires balanced tags for most # elements. However, elements belonging to the following group are # automatically closed: P, LI, TR, TH, and TD. In addition, the # parser automatically inserts end tags immediately after the start # tag, and ignores any end tags for the following group: IMG, HR, # META, and LINK. # # @keyparam builder Optional builder object. If omitted, the parser # uses the standard elementtree builder. # @keyparam encoding Optional character encoding, if known. If omitted, # the parser looks for META tags inside the document. If no tags # are found, the parser defaults to ISO-8859-1. Note that if your # document uses a non-ASCII compatible encoding, you must decode # the document before parsing. # # @see elementtree.ElementTree class HTMLTreeBuilder(HTMLParser): # FIXME: shouldn't this class be named Parser, not Builder? def __init__(self, builder=None, encoding=None): self.__stack = [] if builder is None: builder = ElementTree.TreeBuilder() self.__builder = builder self.encoding = encoding or "iso-8859-1" HTMLParser.__init__(self) ## # Flushes parser buffers, and return the root element. # # @return An Element instance. def close(self): HTMLParser.close(self) return self.__builder.close() ## # (Internal) Handles start tags. def handle_starttag(self, tag, attrs): if tag == "meta": # look for encoding directives http_equiv = content = None for k, v in attrs: if k == "http-equiv": http_equiv = string.lower(v) elif k == "content": content = v if http_equiv == "content-type" and content: # use mimetools to parse the http header header = mimetools.Message( StringIO.StringIO("%s: %s\n\n" % (http_equiv, content)) ) encoding = header.getparam("charset") if encoding: self.encoding = encoding if tag in AUTOCLOSE: if self.__stack and self.__stack[-1] == tag: self.handle_endtag(tag) self.__stack.append(tag) attrib = {} if attrs: for k, v in attrs: attrib[string.lower(k)] = v self.__builder.start(tag, attrib) if tag in IGNOREEND: self.__stack.pop() self.__builder.end(tag) ## # (Internal) Handles end tags. def handle_endtag(self, tag): if tag in IGNOREEND: return lasttag = self.__stack.pop() if tag != lasttag and lasttag in AUTOCLOSE: self.handle_endtag(lasttag) self.__builder.end(tag) ## # (Internal) Handles character references. def handle_charref(self, char): if char[:1] == "x": char = int(char[1:], 16) else: char = int(char) if 0 <= char < 128: self.__builder.data(chr(char)) else: self.__builder.data(unichr(char)) ## # (Internal) Handles entity references. def handle_entityref(self, name): entity = htmlentitydefs.entitydefs.get(name) if entity: if len(entity) == 1: entity = ord(entity) else: entity = int(entity[2:-1]) if 0 <= entity < 128: self.__builder.data(chr(entity)) else: self.__builder.data(unichr(entity)) else: self.unknown_entityref(name) ## # (Internal) Handles character data. def handle_data(self, data): if isinstance(data, type('')) and is_not_ascii(data): # convert to unicode, but only if necessary data = unicode(data, self.encoding, "ignore") self.__builder.data(data) ## # (Hook) Handles unknown entity references. The default action # is to ignore unknown entities. def unknown_entityref(self, name): pass # ignore by default; override if necessary ## # An alias for the HTMLTreeBuilder class. TreeBuilder = HTMLTreeBuilder ## # Parse an HTML document or document fragment. # # @param source A filename or file object containing HTML data. # @param encoding Optional character encoding, if known. If omitted, # the parser looks for META tags inside the document. If no tags # are found, the parser defaults to ISO-8859-1. # @return An ElementTree instance def parse(source, encoding=None): return ElementTree.parse(source, HTMLTreeBuilder(encoding=encoding)) if __name__ == "__main__": import sys ElementTree.dump(parse(open(sys.argv[1])))