# Author: David Goodger
# Contact: goodger@users.sourceforge.net
# Revision: $Revision: 3367 $
# Date: $Date: 2005-05-26 02:44:13 +0200 (Thu, 26 May 2005) $
# Copyright: This module has been placed in the public domain.
"""
Simple HyperText Markup Language document tree Writer.
The output conforms to the HTML 4.01 Transitional DTD and to the Extensible
HTML version 1.0 Transitional DTD (*almost* strict). The output contains a
minimum of formatting information. A cascading style sheet ("default.css" by
default) is required for proper viewing with a modern graphical browser.
"""
__docformat__ = 'reStructuredText'
import sys
import os
import os.path
import time
import re
from types import ListType
try:
import Image # check for the Python Imaging Library
except ImportError:
Image = None
import docutils
from docutils import frontend, nodes, utils, writers, languages
class Writer(writers.Writer):
supported = ('html', 'html4css1', 'xhtml')
"""Formats this writer supports."""
settings_spec = (
'HTML-Specific Options',
None,
(('Specify a stylesheet URL, used verbatim. Default is '
'"default.css". Overrides --stylesheet-path.',
['--stylesheet'],
{'default': 'default.css', 'metavar': '',
'overrides': 'stylesheet_path'}),
('Specify a stylesheet file, relative to the current working '
'directory. The path is adjusted relative to the output HTML '
'file. Overrides --stylesheet.',
['--stylesheet-path'],
{'metavar': '', 'overrides': 'stylesheet'}),
('Link to the stylesheet in the output HTML file. This is the '
'default.',
['--link-stylesheet'],
{'dest': 'embed_stylesheet', 'action': 'store_false',
'validator': frontend.validate_boolean}),
('Embed the stylesheet in the output HTML file. The stylesheet '
'file must be accessible during processing (--stylesheet-path is '
'recommended). Default: link the stylesheet, do not embed it.',
['--embed-stylesheet'],
{'action': 'store_true', 'validator': frontend.validate_boolean}),
('Specify the initial header level. Default is 1 for "
". '
'Does not affect document title & subtitle (see --no-doc-title).',
['--initial-header-level'],
{'choices': '1 2 3 4 5 6'.split(), 'default': '1',
'metavar': ''}),
('Specify the maximum width (in characters) for one-column field '
'names. Longer field names will span an entire row of the table '
'used to render the field list. Default is 14 characters. '
'Use 0 for "no limit".',
['--field-name-limit'],
{'default': 14, 'metavar': '',
'validator': frontend.validate_nonnegative_int}),
('Specify the maximum width (in characters) for options in option '
'lists. Longer options will span an entire row of the table used '
'to render the option list. Default is 14 characters. '
'Use 0 for "no limit".',
['--option-limit'],
{'default': 14, 'metavar': '',
'validator': frontend.validate_nonnegative_int}),
('Format for footnote references: one of "superscript" or '
'"brackets". Default is "brackets".',
['--footnote-references'],
{'choices': ['superscript', 'brackets'], 'default': 'brackets',
'metavar': '',
'overrides': 'trim_footnote_reference_space'}),
('Format for block quote attributions: one of "dash" (em-dash '
'prefix), "parentheses"/"parens", or "none". Default is "dash".',
['--attribution'],
{'choices': ['dash', 'parentheses', 'parens', 'none'],
'default': 'dash', 'metavar': ''}),
('Remove extra vertical whitespace between items of bullet lists '
'and enumerated lists, when list items are "simple" (i.e., all '
'items each contain one paragraph and/or one "simple" sublist '
'only). Default: enabled.',
['--compact-lists'],
{'default': 1, 'action': 'store_true',
'validator': frontend.validate_boolean}),
('Disable compact simple bullet and enumerated lists.',
['--no-compact-lists'],
{'dest': 'compact_lists', 'action': 'store_false'}),
('Omit the XML declaration. Use with caution.',
['--no-xml-declaration'],
{'dest': 'xml_declaration', 'default': 1, 'action': 'store_false',
'validator': frontend.validate_boolean}),
('Scramble email addresses to confuse harvesters. '
'For example, "abc@example.org" will become '
'``abc at example dot org``.',
['--cloak-email-addresses'],
{'action': 'store_true', 'validator': frontend.validate_boolean}),))
relative_path_settings = ('stylesheet_path',)
config_section = 'html4css1 writer'
config_section_dependencies = ('writers',)
def __init__(self):
writers.Writer.__init__(self)
self.translator_class = HTMLTranslator
def translate(self):
self.visitor = visitor = self.translator_class(self.document)
self.document.walkabout(visitor)
self.output = visitor.astext()
for attr in ('head_prefix', 'stylesheet', 'head', 'body_prefix',
'body_pre_docinfo', 'docinfo', 'body', 'fragment',
'body_suffix'):
setattr(self, attr, getattr(visitor, attr))
def assemble_parts(self):
writers.Writer.assemble_parts(self)
for part in ('title', 'subtitle', 'docinfo', 'body', 'header',
'footer', 'meta', 'stylesheet', 'fragment',
'html_prolog', 'html_head', 'html_title', 'html_subtitle',
'html_body'):
self.parts[part] = ''.join(getattr(self.visitor, part))
class HTMLTranslator(nodes.NodeVisitor):
"""
This HTML writer has been optimized to produce visually compact
lists (less vertical whitespace). HTML's mixed content models
allow list items to contain "
body elements
" or
"
just text
" or even "
text
and body
elements
combined
", each with different effects. It would
be best to stick with strict body elements in list items, but they
affect vertical spacing in browsers (although they really
shouldn't).
Here is an outline of the optimization:
- Check for and omit
tags in "simple" lists: list items
contain either a single paragraph, a nested simple list, or a
paragraph followed by a nested simple list. This means that
this list can be compact:
- Item 1.
- Item 2.
But this list cannot be compact:
- Item 1.
This second paragraph forces space between list items.
- Item 2.
- In non-list contexts, omit
tags on a paragraph if that
paragraph is the only child of its parent (footnotes & citations
are allowed a label first).
- Regardless of the above, in definitions, table cells, field bodies,
option descriptions, and list items, mark the first child with
'class="first"' and the last child with 'class="last"'. The stylesheet
sets the margins (top & bottom respectively) to 0 for these elements.
The ``no_compact_lists`` setting (``--no-compact-lists`` command-line
option) disables list whitespace optimization.
"""
xml_declaration = '\n'
doctype = ('\n')
head_prefix_template = ('\n
\n')
content_type = ('\n')
generator = ('\n')
stylesheet_link = '\n'
embedded_stylesheet = '\n'
named_tags = ['a', 'applet', 'form', 'frame', 'iframe', 'img', 'map']
words_and_spaces = re.compile(r'\S+| +|\n')
def __init__(self, document):
nodes.NodeVisitor.__init__(self, document)
self.settings = settings = document.settings
lcode = settings.language_code
self.language = languages.get_language(lcode)
self.meta = [self.content_type % settings.output_encoding,
self.generator % docutils.__version__]
self.head_prefix = []
self.html_prolog = []
if settings.xml_declaration:
self.head_prefix.append(self.xml_declaration
% settings.output_encoding)
# encoding not interpolated:
self.html_prolog.append(self.xml_declaration)
self.head_prefix.extend([self.doctype,
self.head_prefix_template % (lcode, lcode)])
self.html_prolog.append(self.doctype)
self.head = self.meta[:]
if settings.embed_stylesheet:
stylesheet = utils.get_stylesheet_reference(settings,
os.path.join(os.getcwd(), 'dummy'))
settings.record_dependencies.add(stylesheet)
stylesheet_text = open(stylesheet).read()
self.stylesheet = [self.embedded_stylesheet % stylesheet_text]
else:
stylesheet = utils.get_stylesheet_reference(settings)
if stylesheet:
self.stylesheet = [self.stylesheet_link
% self.encode(stylesheet)]
else:
self.stylesheet = []
self.body_prefix = ['\n\n']
# document title, subtitle display
self.body_pre_docinfo = []
# author, date, etc.
self.docinfo = []
self.body = []
self.fragment = []
self.body_suffix = ['\n\n']
self.section_level = 0
self.initial_header_level = int(settings.initial_header_level)
# A heterogenous stack used in conjunction with the tree traversal.
# Make sure that the pops correspond to the pushes:
self.context = []
self.topic_classes = []
self.colspecs = []
self.compact_p = 1
self.compact_simple = None
self.in_docinfo = None
self.in_sidebar = None
self.title = []
self.subtitle = []
self.header = []
self.footer = []
self.html_head = [self.content_type] # charset not interpolated
self.html_title = []
self.html_subtitle = []
self.html_body = []
self.in_document_title = 0
self.in_mailto = 0
def astext(self):
return ''.join(self.head_prefix + self.head
+ self.stylesheet + self.body_prefix
+ self.body_pre_docinfo + self.docinfo
+ self.body + self.body_suffix)
def encode(self, text):
"""Encode special characters in `text` & return."""
# @@@ A codec to do these and all other HTML entities would be nice.
text = text.replace("&", "&")
text = text.replace("<", "<")
text = text.replace('"', """)
text = text.replace(">", ">")
text = text.replace("@", "@") # may thwart some address harvesters
# Replace the non-breaking space character with the HTML entity:
text = text.replace(u'\u00a0', " ")
return text
def cloak_mailto(self, uri):
"""Try to hide a mailto: URL from harvesters."""
addr = uri.split(':', 1)[1]
if '?' in addr:
addr, query = addr.split('?', 1)
query = '?' + query
else:
query = ''
escaped = ['%%%02X' % ord(c) for c in addr]
return 'mailto:%s%s' % (''.join(escaped), query)
def cloak_email(self, addr):
return addr.replace('@', ' at ').replace('.', ' dot ')
def attval(self, text,
whitespace=re.compile('[\n\r\t\v\f]')):
"""Cleanse, HTML encode, and return attribute value text."""
return self.encode(whitespace.sub(' ', text))
def starttag(self, node, tagname, suffix='\n', infix='', **attributes):
"""
Construct and return a start tag given a node (id & class attributes
are extracted), tag name, and optional attributes.
"""
tagname = tagname.lower()
prefix = []
atts = {}
for (name, value) in attributes.items():
atts[name.lower()] = value
classes = node.get('classes', [])
if atts.has_key('class'):
classes.append(atts['class'])
if classes:
atts['class'] = ' '.join(classes)
assert not atts.has_key('id')
if node.get('ids'):
atts['id'] = node['ids'][0]
for id in node['ids'][1:]:
prefix.append('' % id)
if atts.has_key('id') and tagname in self.named_tags:
atts['name'] = atts['id'] # for compatibility with old browsers
attlist = atts.items()
attlist.sort()
parts = [tagname]
for name, value in attlist:
# value=None was used for boolean attributes without
# value, but this isn't supported by XHTML.
assert value is not None
if isinstance(value, ListType):
values = [unicode(v) for v in value]
parts.append('%s="%s"' % (name.lower(),
self.attval(' '.join(values))))
else:
try:
uval = unicode(value)
except TypeError: # for Python 2.1 compatibility:
uval = unicode(str(value))
parts.append('%s="%s"' % (name.lower(), self.attval(uval)))
return ''.join(prefix) + '<%s%s>' % (' '.join(parts), infix) + suffix
def emptytag(self, node, tagname, suffix='\n', **attributes):
"""Construct and return an XML-compatible empty tag."""
return self.starttag(node, tagname, suffix, infix=' /', **attributes)
def set_first_last(self, node):
children = [n for n in node if not isinstance(n, nodes.Invisible)]
if children:
children[0]['classes'].append('first')
children[-1]['classes'].append('last')
def visit_Text(self, node):
text = node.astext()
if self.in_mailto and self.settings.cloak_email_addresses:
text = self.cloak_email(text)
self.body.append(self.encode(text))
def depart_Text(self, node):
pass
def visit_abbreviation(self, node):
# @@@ implementation incomplete ("title" attribute)
self.body.append(self.starttag(node, 'abbr', ''))
def depart_abbreviation(self, node):
self.body.append('')
def visit_acronym(self, node):
# @@@ implementation incomplete ("title" attribute)
self.body.append(self.starttag(node, 'acronym', ''))
def depart_acronym(self, node):
self.body.append('')
def visit_address(self, node):
self.visit_docinfo_item(node, 'address', meta=None)
self.body.append(self.starttag(node, 'pre', CLASS='address'))
def depart_address(self, node):
self.body.append('\n\n')
self.depart_docinfo_item()
def visit_admonition(self, node, name=''):
self.body.append(self.starttag(node, 'div',
CLASS=(name or 'admonition')))
if name:
node.insert(0, nodes.title(name, self.language.labels[name]))
self.set_first_last(node)
def depart_admonition(self, node=None):
self.body.append('\n')
def visit_attention(self, node):
self.visit_admonition(node, 'attention')
def depart_attention(self, node):
self.depart_admonition()
attribution_formats = {'dash': ('—', ''),
'parentheses': ('(', ')'),
'parens': ('(', ')'),
'none': ('', '')}
def visit_attribution(self, node):
prefix, suffix = self.attribution_formats[self.settings.attribution]
self.context.append(suffix)
self.body.append(
self.starttag(node, 'p', prefix, CLASS='attribution'))
def depart_attribution(self, node):
self.body.append(self.context.pop() + '
tags around paragraph ``node`` can be omitted.
"""
if (isinstance(node.parent, nodes.document) or
isinstance(node.parent, nodes.compound)):
# Never compact paragraphs in document or compound.
return 0
for key, value in node.attlist():
if (node.is_not_default(key) and
not (key == 'classes' and value in
([], ['first'], ['last'], ['first', 'last']))):
# Attribute which needs to survive.
return 0
if (self.compact_simple or
self.compact_p and (len(node.parent) == 1 or
len(node.parent) == 2 and
isinstance(node.parent[0], nodes.label))):
return 1
return 0
def visit_paragraph(self, node):
if self.should_be_compact_paragraph(node):
self.context.append('')
else:
self.body.append(self.starttag(node, 'p', ''))
self.context.append('