"""A simple "pull API" for HTML parsing, after Perl's HTML::TokeParser. Examples This program extracts all links from a document. It will print one line for each link, containing the URL and the textual description between the ... tags: import pullparser, sys f = file(sys.argv[1]) p = pullparser.PullParser(f) for token in p.tags("a"): if token.type == "endtag": continue url = dict(token.attrs).get("href", "-") text = p.get_compressed_text(endat=("endtag", "a")) print "%s\t%s" % (url, text) This program extracts the from the document: import pullparser, sys f = file(sys.argv[1]) p = pullparser.PullParser(f) if p.get_tag("title"): title = p.get_compressed_text() print "Title: %s" % title Copyright 2003-2004 John J. Lee <jjl@pobox.com> Copyright 1998-2001 Gisle Aas (original libwww-perl code) This code is free software; you can redistribute it and/or modify it under the terms of the BSD License. """ from __future__ import generators import re, htmlentitydefs import HTMLParser __version__ = (0, 0, 6, None, None) # 0.0.6b class NoMoreTokensError(Exception): pass class Token: """Represents an HTML tag, declaration, processing instruction etc. Behaves as both a tuple-like object (ie. iterable) and has attributes .type, .data and .attrs. >>> t = Token("starttag", "a", [("href", "http://www.python.org/")]) >>> t == ("starttag", "a", [("href", "http://www.python.org/")]) True >>> t.type, t.data == "starttag", "a" True >>> t.attrs == [("href", "http://www.python.org/")] True Public attributes type: one of "starttag", "endtag", "startendtag", "charref", "entityref", "data", "comment", "decl", "pi", after the corresponding methods of HTMLParser.HTMLParser data: For a tag, the tag name; otherwise, the relevant data carried by the tag, as a string attrs: list of (name, value) pairs representing HTML attributes (or None if token does not represent an opening tag) """ def __init__(self, type, data, attrs=None): self.type = type self.data = data self.attrs = attrs def __iter__(self): return iter((self.type, self.data, self.attrs)) def __eq__(self, other): type, data, attrs = other if (self.type == type and self.data == data and self.attrs == attrs): return True else: return False def __ne__(self, other): return not self.__eq__(other) def __repr__(self): args = ", ".join(map(repr, [self.type, self.data, self.attrs])) return self.__class__.__name__+"(%s)" % args def iter_until_exception(fn, exception, *args, **kwds): while 1: try: yield fn(*args, **kwds) except exception: raise StopIteration def caller(): try: raise SyntaxError except: import sys return sys.exc_traceback.tb_frame.f_back.f_back.f_code.co_name def unescape(data, entities): if data is None or '&' not in data: return data def replace_entities(match): ent = match.group() repl = entities.get(ent, ent) return repl return re.sub(r'&\S+?;', replace_entities, data) def get_entitydefs(): entitydefs = {} for name, char in htmlentitydefs.entitydefs.items(): entitydefs["&%s;" % name] = char return entitydefs class _AbstractParser: chunk = 1024 compress_re = re.compile(r"\s+") entitydefs = htmlentitydefs.entitydefs def __init__(self, fh, textify={"img": "alt", "applet": "alt"}, encoding="ascii", entitydefs=None): """ fh: file-like object (only a .read() method is required) from which to read HTML to be parsed textify: mapping used by .get_text() and .get_compressed_text() methods to represent opening tags as text encoding: encoding used to encode numeric character references by .get_text() and .get_compressed_text() ("ascii" by default) entitydefs: mapping like {'&': '&', ...} containing HTML entity definitions (a sensible default is used) If the element name of an opening tag matches a key in the textify mapping then that tag is converted to text. The corresponding value is used to specify which tag attribute to obtain the text from. textify maps from element names to either: - an HTML attribute name, in which case the HTML attribute value is used as its text value along with the element name in square brackets (eg."alt text goes here[IMG]", or, if the alt attribute were missing, just "[IMG]") - a callable object (eg. a function) which takes a Token and returns the string to be used as its text value If textify has no key for an element name, nothing is substituted for the opening tag. Public attributes: encoding and textify: see above """ self._fh = fh self._tokenstack = [] # FIFO self.textify = textify self.encoding = encoding if entitydefs is None: entitydefs = get_entitydefs() self._entitydefs = entitydefs def __iter__(self): return self def tags(self, *names): return iter_until_exception(self.get_tag, NoMoreTokensError, *names) def tokens(self, *tokentypes): return iter_until_exception(self.get_token, NoMoreTokensError, *tokentypes) def next(self): try: return self.get_token() except NoMoreTokensError: raise StopIteration() def get_token(self, *tokentypes): """Pop the next Token object from the stack of parsed tokens. If arguments are given, they are taken to be token types in which the caller is interested: tokens representing other elements will be skipped. Element names must be given in lower case. Raises NoMoreTokensError. """ while 1: while self._tokenstack: token = self._tokenstack.pop(0) if tokentypes: if token.type in tokentypes: return token else: return token data = self._fh.read(self.chunk) if not data: raise NoMoreTokensError() self.feed(data) def unget_token(self, token): """Push a Token back onto the stack.""" self._tokenstack.insert(0, token) def get_tag(self, *names): """Return the next Token that represents an opening or closing tag. If arguments are given, they are taken to be element names in which the caller is interested: tags representing other elements will be skipped. Element names must be given in lower case. Raises NoMoreTokensError. """ while 1: tok = self.get_token() if tok.type not in ["starttag", "endtag", "startendtag"]: continue if names: if tok.data in names: return tok else: return tok def get_text(self, endat=None): """Get some text. endat: stop reading text at this tag (the tag is included in the returned text); endtag is a tuple (type, name) where type is "starttag", "endtag" or "startendtag", and name is the element name of the tag (element names must be given in lower case) If endat is not given, .get_text() will stop at the next opening or closing tag, or when there are no more tokens (no exception is raised). Note that .get_text() includes the text representation (if any) of the opening tag, but pushes the opening tag back onto the stack. As a result, if you want to call .get_text() again, you need to call .get_tag() first (unless you want an empty string returned when you next call .get_text()). Entity references are translated using the entitydefs attribute (a mapping from names to characters like that provided by the standard module htmlentitydefs). Named entity references that are not in this mapping are left unchanged. The textify attribute is used to translate opening tags into text: see the class docstring. """ text = [] tok = None while 1: try: tok = self.get_token() except NoMoreTokensError: # unget last token (not the one we just failed to get) if tok: self.unget_token(tok) break if tok.type == "data": text.append(tok.data) elif tok.type == "entityref": name = tok.data if name in self.entitydefs: t = self.entitydefs[name] else: t = "&%s;" % name text.append(t) elif tok.type == "charref": name, base = tok.data, 10 if name.startswith('x'): name, base= name[1:], 16 t = unichr(int(name, base)).encode(self.encoding) text.append(t) elif tok.type in ["starttag", "endtag", "startendtag"]: tag_name = tok.data if tok.type in ["starttag", "startendtag"]: alt = self.textify.get(tag_name) if alt is not None: if callable(alt): text.append(alt(tok)) elif tok.attrs is not None: for k, v in tok.attrs: if k == alt: text.append(v) text.append("[%s]" % tag_name.upper()) if endat is None or endat == (tok.type, tag_name): self.unget_token(tok) break return "".join(text) def get_compressed_text(self, *args, **kwds): """ As .get_text(), but collapses each group of contiguous whitespace to a single space character, and removes all initial and trailing whitespace. """ text = self.get_text(*args, **kwds) text = text.strip() return self.compress_re.sub(" ", text) def handle_startendtag(self, tag, attrs): self._tokenstack.append(Token("startendtag", tag, attrs)) def handle_starttag(self, tag, attrs): self._tokenstack.append(Token("starttag", tag, attrs)) def handle_endtag(self, tag): self._tokenstack.append(Token("endtag", tag)) def handle_charref(self, name): self._tokenstack.append(Token("charref", name)) def handle_entityref(self, name): self._tokenstack.append(Token("entityref", name)) def handle_data(self, data): self._tokenstack.append(Token("data", data)) def handle_comment(self, data): self._tokenstack.append(Token("comment", data)) def handle_decl(self, decl): self._tokenstack.append(Token("decl", decl)) def unknown_decl(self, data): # XXX should this call self.error instead? #self.error("unknown declaration: " + `data`) self._tokenstack.append(Token("decl", data)) def handle_pi(self, data): self._tokenstack.append(Token("pi", data)) def unescape_attr(self, name): return unescape(name, self._entitydefs) def unescape_attrs(self, attrs): escaped_attrs = [] for key, val in attrs: escaped_attrs.append((key, self.unescape_attr(val))) return escaped_attrs class PullParser(_AbstractParser, HTMLParser.HTMLParser): def __init__(self, *args, **kwds): HTMLParser.HTMLParser.__init__(self) _AbstractParser.__init__(self, *args, **kwds) def unescape(self, name): # Use the entitydefs passed into constructor, not # HTMLParser.HTMLParser's entitydefs. return self.unescape_attr(name) import sgmllib class TolerantPullParser(_AbstractParser, sgmllib.SGMLParser): def __init__(self, *args, **kwds): sgmllib.SGMLParser.__init__(self) _AbstractParser.__init__(self, *args, **kwds) def unknown_starttag(self, tag, attrs): attrs = self.unescape_attrs(attrs) self._tokenstack.append(Token("starttag", tag, attrs)) def unknown_endtag(self, tag): self._tokenstack.append(Token("endtag", tag))