#
# ElementTree
# $Id: TidyTools.py 1862 2004-06-18 07:31:02Z Fredrik $
#
# tools to run the "tidy" command on an HTML or XHTML file, and return
# the contents as an XHTML element tree.
#
# history:
# 2002-10-19 fl   added to ElementTree library; added getzonebody function
#
# Copyright (c) 1999-2004 by Fredrik Lundh. All rights reserved.
#
# fredrik@pythonware.com
# http://www.pythonware.com
#

##
# Tools to build element trees from HTML, using the external tidy
# utility.
##

import glob, string, os, sys
import subprocess

from ElementTree import ElementTree, Element

NS_XHTML = "{http://www.w3.org/1999/xhtml}"

##
# Convert an HTML or HTML-like file to XHTML, using the tidy
# command line utility.
# <p>
# The converted document is written to <i>file</i>.out and tidy's
# diagnostics to <i>file</i>.err.  Both files are removed when the
# output parses as XML; if it does not, they are left in place so
# the problem can be inspected.
#
# @param file Filename.
# @param new_inline_tags An optional list of valid but non-standard
#     inline tags.
# @return An element tree, or None if not successful.

def tidy(file, new_inline_tags=None):

    command = ["tidy", "-qn", "-asxml"]

    if new_inline_tags:
        command.append("--new-inline-tags")
        command.append(",".join(new_inline_tags))

    # FIXME: support more tidy options!

    command.append(file)

    out_name = file + ".out"
    err_name = file + ".err"

    # convert.  the command is run from an argument list, without a
    # shell, so file names containing spaces or shell metacharacters
    # are passed to tidy unchanged.  tidy's exit status is ignored;
    # success is decided by whether the output parses (below).
    try:
        out = open(out_name, "wb")
        try:
            err = open(err_name, "wb")
            try:
                subprocess.call(command, stdout=out, stderr=err)
            finally:
                err.close()
        finally:
            out.close()
    except EnvironmentError:
        # tidy could not be started, or the side files could not be
        # created; report it and signal failure like a parse error
        print("*** %s:%s" % sys.exc_info()[:2])
        print("*** cannot run tidy on %s" % file)
        return None

    # check that the result is valid XML
    try:
        tree = ElementTree()
        tree.parse(out_name)
    except Exception:
        print("*** %s:%s" % sys.exc_info()[:2])
        print("*** %s is not valid XML "
              "(check %s.err for info)" % (file, file))
        tree = None
    else:
        # success; the side files are no longer needed
        if os.path.isfile(out_name):
            os.remove(out_name)
        if os.path.isfile(err_name):
            os.remove(err_name)

    return tree

##
# Get document body from an HTML or HTML-like file.  This function
# uses the tidy function to convert HTML to XHTML, and cleans
# up the resulting XML tree.
#
# @param file Filename.
# @return A body element, or None if not successful.

def getbody(file, **options):
    # get clean body from text file

    # get xhtml tree.  any keyword options are handed on to tidy().
    try:
        tree = tidy(file, **options)
    except IOError:
        # fetch the exception via sys.exc_info(), like tidy() does, so
        # the handler does not depend on version-specific except syntax
        print("*** %s" % (sys.exc_info()[1],))
        return None

    if tree is None:
        return None

    # remove namespace uris, so elements can be looked up by their
    # plain tag names
    prefix_size = len(NS_XHTML)
    for node in tree.getiterator():
        if node.tag.startswith(NS_XHTML):
            node.tag = node.tag[prefix_size:]

    return tree.getroot().find("body")

##
# Same as <b>getbody</b>, but turns plain text at the start of the
# document into an H1 tag.  This function can be used to parse zone
# documents.
#
# @param file Filename.
# @return A body element, or None if not successful.

def getzonebody(file, **options):

    body = getbody(file, **options)
    if body is None:
        return None

    # text placed directly in the body, before its first child element,
    # becomes the title
    heading = (body.text or "").strip()
    if heading:
        title = Element("h1")
        title.text = heading
        title.tail = "\n\n"
        body.insert(0, title)

    # the leading text now lives in the h1 element (or was blank)
    body.text = None

    return body

if __name__ == "__main__":

    import sys
    for arg in sys.argv[1:]:
        for file in glob.glob(arg):
            # write the file name before converting, so any diagnostics
            # printed by tidy() follow the name of the file they concern
            sys.stdout.write("%s ... " % file)
            print(tidy(file))