""" sphinx.builders._epub_base ~~~~~~~~~~~~~~~~~~~~~~~~~~ Base class of epub2/epub3 builders. :copyright: Copyright 2007-2020 by the Sphinx team, see AUTHORS. :license: BSD, see LICENSE for details. """ import html import os import re import warnings from collections import namedtuple from os import path from typing import Any, Dict, List, Set, Tuple from zipfile import ZIP_DEFLATED, ZIP_STORED, ZipFile from docutils import nodes from docutils.nodes import Element, Node from docutils.utils import smartquotes from sphinx import addnodes from sphinx.builders.html import BuildInfo, StandaloneHTMLBuilder from sphinx.deprecation import RemovedInSphinx40Warning from sphinx.locale import __ from sphinx.util import logging from sphinx.util import status_iterator from sphinx.util.fileutil import copy_asset_file from sphinx.util.i18n import format_date from sphinx.util.osutil import ensuredir, copyfile try: from PIL import Image except ImportError: Image = None logger = logging.getLogger(__name__) # (Fragment) templates from which the metainfo files content.opf and # toc.ncx are created. # This template section also defines strings that are embedded in the html # output but that may be customized by (re-)setting module attributes, # e.g. from conf.py. 
COVERPAGE_NAME = 'epub-cover.xhtml'

TOCTREE_TEMPLATE = 'toctree-l%d'

LINK_TARGET_TEMPLATE = ' [%(uri)s]'

FOOTNOTE_LABEL_TEMPLATE = '#%d'

FOOTNOTES_RUBRIC_NAME = 'Footnotes'

CSS_LINK_TARGET_CLASS = 'link-target'

# XXX These strings should be localized according to epub_language
GUIDE_TITLES = {
    'toc': 'Table of Contents',
    'cover': 'Cover'
}

# Maps a file extension to the media type written into the manifest.
MEDIA_TYPES = {
    '.xhtml': 'application/xhtml+xml',
    '.css': 'text/css',
    '.png': 'image/png',
    '.gif': 'image/gif',
    '.svg': 'image/svg+xml',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.otf': 'application/x-font-otf',
    '.ttf': 'application/x-font-ttf',
    '.woff': 'application/font-woff',
}

VECTOR_GRAPHICS_EXTENSIONS = ('.svg',)

# Regular expression to match colons only in local fragment identifiers.
# If the URI contains a colon before the #,
# it is an external link that should not change.
REFURI_RE = re.compile("([^#:]*#)(.*)")


ManifestItem = namedtuple('ManifestItem', ['href', 'id', 'media_type'])
Spine = namedtuple('Spine', ['idref', 'linear'])
Guide = namedtuple('Guide', ['type', 'title', 'uri'])
NavPoint = namedtuple('NavPoint', ['navpoint', 'playorder', 'text', 'refuri', 'children'])


def sphinx_smarty_pants(t: str, language: str = 'en') -> str:
    """Apply typographic dashes and quotes to the HTML-escaped text *t*.

    Escaped double quotes are turned back into plain ``"`` so that the
    smartquotes helpers can see them; any plain double quote that is still
    present afterwards is escaped again before the text is returned.
    """
    t = t.replace('&quot;', '"')
    t = smartquotes.educateDashesOldSchool(t)
    t = smartquotes.educateQuotes(t, language)
    t = t.replace('"', '&quot;')
    return t


ssp = sphinx_smarty_pants


# The epub publisher

class EpubBuilder(StandaloneHTMLBuilder):
    """
    Builder that outputs epub files.

    It creates the metainfo files content.opf, toc.ncx, mimetype, and
    META-INF/container.xml.  Afterwards, all necessary files are zipped to an
    epub file.
    """

    # don't copy the reST source
    copysource = False
    supported_image_types = ['image/svg+xml', 'image/png', 'image/gif',
                             'image/jpeg']
    supported_remote_images = False

    # don't add links
    add_permalinks = False
    # don't use # as current path. ePub check reject it.
    allow_sharp_as_current_path = False
    # don't add sidebar etc.
    embedded = True
    # disable download role
    download_support = False
    # don't create links to original images from images
    html_scaled_image_link = False
    # don't generate search index or include search page
    search = False

    # Class-level copies of the module constants so that subclasses can
    # override them individually.
    coverpage_name = COVERPAGE_NAME
    toctree_template = TOCTREE_TEMPLATE
    link_target_template = LINK_TARGET_TEMPLATE
    css_link_target_class = CSS_LINK_TARGET_CLASS
    guide_titles = GUIDE_TITLES
    media_types = MEDIA_TYPES
    refuri_re = REFURI_RE
    template_dir = ""
    doctype = ""

    def init(self) -> None:
        """Initialise the builder state on top of the HTML builder's."""
        super().init()
        # the output files for epub must use the .xhtml suffix
        self.out_suffix = '.xhtml'
        self.link_suffix = '.xhtml'
        self.playorder = 0
        self.tocid = 0
        self.id_cache = {}  # type: Dict[str, str]
        self.use_index = self.get_builder_config('use_index', 'epub')
        self.refnodes = []  # type: List[Dict[str, Any]]

    def create_build_info(self) -> BuildInfo:
        """Return the build info, tracking both 'html' and 'epub' config values."""
        return BuildInfo(self.config, self.tags, ['html', 'epub'])

    def get_theme_config(self) -> Tuple[str, Dict]:
        """Return the epub theme name and its options from the configuration."""
        return self.config.epub_theme, self.config.epub_theme_options

    # generic support functions
    def make_id(self, name: str) -> str:
        """Return a unique id for name.

        The id is generated on first use and cached, so the same *name*
        always maps to the same id.
        """
        # id_cache is intentionally mutable
        cached_id = self.id_cache.get(name)
        if not cached_id:
            cached_id = 'epub-%d' % self.env.new_serialno('epub')
            self.id_cache[name] = cached_id
        return cached_id

    def esc(self, name: str) -> str:
        """Replace all characters not allowed in text and attribute values.

        Deprecated: emits ``RemovedInSphinx40Warning``; use ``html.escape()``.
        """
        warnings.warn(
            '%s.esc() is deprecated. Use html.escape() instead.' % self.__class__.__name__,
            RemovedInSphinx40Warning, stacklevel=2)
        # '&' must be replaced first, otherwise the ampersands introduced by
        # the other replacements would be escaped a second time.
        name = name.replace('&', '&amp;')
        name = name.replace('<', '&lt;')
        name = name.replace('>', '&gt;')
        name = name.replace('"', '&quot;')
        name = name.replace('\'', '&#39;')
        return name

    def get_refnodes(self, doctree: Node, result: List[Dict[str, Any]]) -> List[Dict[str, Any]]:  # NOQA
        """Collect section titles, their depth in the toc and the refuri.

        Walks *doctree* recursively, appends one dict (``level``, ``refuri``,
        ``text``) per toctree reference to *result* and returns *result*.
        """
        # XXX: is there a better way than checking the attribute
        # toctree-l[1-8] on the parent node?
        if isinstance(doctree, nodes.reference) and doctree.get('refuri'):
            refuri = doctree['refuri']
            # external links never become ToC entries
            if refuri.startswith(('http://', 'https://', 'irc:', 'mailto:')):
                return result
            classes = doctree.parent.attributes['classes']
            for level in range(8, 0, -1):  # or range(1, 8)?
                if (self.toctree_template % level) in classes:
                    result.append({
                        'level': level,
                        'refuri': html.escape(refuri),
                        'text': ssp(html.escape(doctree.astext()))
                    })
                    break
        elif isinstance(doctree, nodes.Element):
            for elem in doctree:
                result = self.get_refnodes(elem, result)
        return result

    def check_refnodes(self, nodes: List[Dict[str, Any]]) -> None:
        """Log a warning for every refuri that occurs more than once in *nodes*."""
        appeared = set()  # type: Set[str]
        for node in nodes:
            if node['refuri'] in appeared:
                logger.warning(__('duplicated ToC entry found: %s'), node['refuri'])
            else:
                appeared.add(node['refuri'])

    def get_toc(self) -> None:
        """Get the total table of contents, containing the master_doc
        and pre and post files not managed by sphinx.

        The result is stored in ``self.refnodes``.
        """
        doctree = self.env.get_and_resolve_doctree(self.config.master_doc,
                                                   self, prune_toctrees=False,
                                                   includehidden=True)
        self.refnodes = self.get_refnodes(doctree, [])
        master_dir = path.dirname(self.config.master_doc)
        if master_dir:
            master_dir += '/'  # XXX or os.sep?
            for item in self.refnodes:
                item['refuri'] = master_dir + item['refuri']
        self.toc_add_files(self.refnodes)

    def toc_add_files(self, refnodes: List[Dict[str, Any]]) -> None:
        """Add the master_doc, pre and post files to a list of refnodes.

        *refnodes* is modified in place: the master document and the
        ``epub_pre_files`` are put in front, the ``epub_post_files`` at the end.
        """
        refnodes.insert(0, {
            'level': 1,
            'refuri': html.escape(self.config.master_doc + self.out_suffix),
            'text': ssp(html.escape(
                self.env.titles[self.config.master_doc].astext()))
        })
        # inserting at index 0 in reversed order keeps the configured order
        for filename, text in reversed(self.config.epub_pre_files):
            refnodes.insert(0, {
                'level': 1,
                'refuri': html.escape(filename),
                'text': ssp(html.escape(text))
            })
        for filename, text in self.config.epub_post_files:
            refnodes.append({
                'level': 1,
                'refuri': html.escape(filename),
                'text': ssp(html.escape(text))
            })

    def fix_fragment(self, prefix: str, fragment: str) -> str:
        """Return a href/id attribute with colons replaced by hyphens."""
        return prefix + fragment.replace(':', '-')

    def fix_ids(self, tree: nodes.document) -> None:
        """Replace colons with hyphens in href and id attributes.

        Some readers crash because they interpret the part as a
        transport protocol specification.
        """
        def update_node_id(node: Element) -> None:
            """Update IDs of given *node*."""
            new_ids = []
            for node_id in node['ids']:
                new_id = self.fix_fragment('', node_id)
                # two ids may collapse into the same value; keep only one
                if new_id not in new_ids:
                    new_ids.append(new_id)
            node['ids'] = new_ids

        for reference in tree.traverse(nodes.reference):
            if 'refuri' in reference:
                m = self.refuri_re.match(reference['refuri'])
                if m:
                    reference['refuri'] = self.fix_fragment(m.group(1), m.group(2))
            if 'refid' in reference:
                reference['refid'] = self.fix_fragment('', reference['refid'])

        for target in tree.traverse(nodes.target):
            update_node_id(target)

            next_node = target.next_node(ascend=True)  # type: Node
            if isinstance(next_node, nodes.Element):
                update_node_id(next_node)

        for desc_signature in tree.traverse(addnodes.desc_signature):
            update_node_id(desc_signature)

    def add_visible_links(self, tree: nodes.document, show_urls: str = 'inline') -> None:
        """Add visible link targets for external links.

        *show_urls* selects the presentation: ``'inline'`` puts the URI right
        after the link, ``'footnote'`` creates a footnote per link and
        ``'no'`` leaves the tree untouched.
        """

        def make_footnote_ref(doc: nodes.document, label: str) -> nodes.footnote_reference:
            """Create a footnote_reference node with children"""
            footnote_ref = nodes.footnote_reference('[#]_')
            footnote_ref.append(nodes.Text(label))
            doc.note_autofootnote_ref(footnote_ref)
            return footnote_ref

        def make_footnote(doc: nodes.document, label: str, uri: str) -> nodes.footnote:
            """Create a footnote node with children"""
            footnote = nodes.footnote(uri)
            para = nodes.paragraph()
            para.append(nodes.Text(uri))
            footnote.append(para)
            footnote.insert(0, nodes.label('', label))
            doc.note_autofootnote(footnote)
            return footnote

        def footnote_spot(tree: nodes.document) -> Tuple[Element, int]:
            """Find or create a spot to place footnotes.

            The function returns the tuple (parent, index)."""
            # The code uses the following heuristic:
            # a) place them after the last existing footnote
            # b) place them after an (empty) Footnotes rubric
            # c) create an empty Footnotes rubric at the end of the document
            fns = tree.traverse(nodes.footnote)
            if fns:
                fn = fns[-1]
                return fn.parent, fn.parent.index(fn) + 1
            for node in tree.traverse(nodes.rubric):
                if len(node) == 1 and node.astext() == FOOTNOTES_RUBRIC_NAME:
                    return node.parent, node.parent.index(node) + 1
            doc = tree.traverse(nodes.document)[0]
            rub = nodes.rubric()
            rub.append(nodes.Text(FOOTNOTES_RUBRIC_NAME))
            doc.append(rub)
            return doc, doc.index(rub) + 1

        if show_urls == 'no':
            return
        if show_urls == 'footnote':
            doc = tree.traverse(nodes.document)[0]
            fn_spot, fn_idx = footnote_spot(tree)
            nr = 1
        for node in tree.traverse(nodes.reference):
            uri = node.get('refuri', '')
            # skip links whose visible text already contains the URI
            if uri.startswith(('http:', 'https:', 'ftp:')) and uri not in node.astext():
                idx = node.parent.index(node) + 1
                if show_urls == 'inline':
                    uri = self.link_target_template % {'uri': uri}
                    link = nodes.inline(uri, uri)
                    link['classes'].append(self.css_link_target_class)
                    node.parent.insert(idx, link)
                elif show_urls == 'footnote':
                    label = FOOTNOTE_LABEL_TEMPLATE % nr
                    nr += 1
                    footnote_ref = make_footnote_ref(doc, label)
                    node.parent.insert(idx, footnote_ref)
                    footnote = make_footnote(doc, label, uri)
                    fn_spot.insert(fn_idx, footnote)
                    footnote_ref['refid'] = footnote['ids'][0]
                    footnote.add_backref(footnote_ref['ids'][0])
                    fn_idx += 1

    def write_doc(self, docname: str, doctree: nodes.document) -> None:
        """Write one document file.

        This method is overwritten in order to fix fragment identifiers
        and to add visible external links.
        """
        self.fix_ids(doctree)
        self.add_visible_links(doctree, self.config.epub_show_urls)
        super().write_doc(docname, doctree)

    def fix_genindex(self, tree: List[Tuple[str, List[Tuple[str, Any]]]]) -> None:
        """Fix href attributes for genindex pages."""
        # XXX: modifies tree inline
        # Logic modeled from themes/basic/genindex.html
        for key, columns in tree:
            for entryname, (links, subitems, key_) in columns:
                for (i, (ismain, link)) in enumerate(links):
                    m = self.refuri_re.match(link)
                    if m:
                        links[i] = (ismain,
                                    self.fix_fragment(m.group(1), m.group(2)))
                for subentryname, subentrylinks in subitems:
                    for (i, (ismain, link)) in enumerate(subentrylinks):
                        m = self.refuri_re.match(link)
                        if m:
                            subentrylinks[i] = (ismain,
                                                self.fix_fragment(m.group(1), m.group(2)))

    def is_vector_graphics(self, filename: str) -> bool:
        """Does the filename extension indicate a vector graphic format?"""
        ext = path.splitext(filename)[-1]
        return ext in VECTOR_GRAPHICS_EXTENSIONS

    def copy_image_files_pil(self) -> None:
        """Copy images using Pillow, the Python Imaging Library.

        The method tries to read and write the files with Pillow, converting
        the format and resizing the image if necessary/possible.  Images that
        Pillow cannot open are copied unchanged instead.
        """
        ensuredir(path.join(self.outdir, self.imagedir))
        for src in status_iterator(self.images, __('copying images... '), "brown",
                                   len(self.images), self.app.verbosity):
            dest = self.images[src]
            try:
                img = Image.open(path.join(self.srcdir, src))
            except OSError:
                # vector graphics are expected to be unreadable by Pillow,
                # so only warn for other formats
                if not self.is_vector_graphics(src):
                    logger.warning(__('cannot read image file %r: copying it instead'),
                                   path.join(self.srcdir, src))
                try:
                    copyfile(path.join(self.srcdir, src),
                             path.join(self.outdir, self.imagedir, dest))
                except OSError as err:
                    logger.warning(__('cannot copy image file %r: %s'),
                                   path.join(self.srcdir, src), err)
                continue
            if self.config.epub_fix_images:
                if img.mode in ('P',):
                    # See the Pillow documentation for Image.convert()
                    # https://pillow.readthedocs.io/en/stable/reference/Image.html#PIL.Image.Image.convert
                    img = img.convert()
            if self.config.epub_max_image_width > 0:
                (width, height) = img.size
                nw = self.config.epub_max_image_width
                if width > nw:
                    # keep the aspect ratio; the size passed to resize() must
                    # be whole pixels, so round the computed height
                    nh = round((height * nw) / width)
                    img = img.resize((nw, nh), Image.BICUBIC)
            try:
                img.save(path.join(self.outdir, self.imagedir, dest))
            except OSError as err:
                logger.warning(__('cannot write image file %r: %s'),
                               path.join(self.srcdir, src), err)

    def copy_image_files(self) -> None:
        """Copy image files to destination directory.

        This overwritten method can use Pillow to convert image files.
        """
        if self.images:
            if self.config.epub_fix_images or self.config.epub_max_image_width:
                if not Image:
                    logger.warning(__('Pillow not found - copying image files'))
                    super().copy_image_files()
                else:
                    self.copy_image_files_pil()
            else:
                super().copy_image_files()

    def copy_download_files(self) -> None:
        """Do nothing: the download role is disabled for this builder."""
        pass

    def handle_page(self, pagename: str, addctx: Dict, templatename: str = 'page.html',
                    outfilename: str = None, event_arg: Any = None) -> None:
        """Create a rendered page.

        This method is overwritten for genindex pages in order to fix href link
        attributes.  The genindex page is skipped when ``use_index`` is false.
        """
        if pagename.startswith('genindex') and 'genindexentries' in addctx:
            if not self.use_index:
                return
            self.fix_genindex(addctx['genindexentries'])
        addctx['doctype'] = self.doctype
        super().handle_page(pagename, addctx, templatename, outfilename, event_arg)

    def build_mimetype(self, outdir: str = None, outname: str = 'mimetype') -> None:
        """Write the metainfo file mimetype.

        Passing *outdir* is deprecated; by default ``self.outdir`` is used.
        """
        if outdir:
            warnings.warn('The arguments of EpubBuilder.build_mimetype() is deprecated.',
                          RemovedInSphinx40Warning, stacklevel=2)
        else:
            outdir = self.outdir

        logger.info(__('writing %s file...'), outname)
        copy_asset_file(path.join(self.template_dir, 'mimetype'),
                        path.join(outdir, outname))

    def build_container(self, outdir: str = None, outname: str = 'META-INF/container.xml') -> None:  # NOQA
        """Write the metainfo file META-INF/container.xml.

        Passing *outdir* is deprecated; by default ``self.outdir`` is used.
        """
        if outdir:
            warnings.warn('The arguments of EpubBuilder.build_container() is deprecated.',
                          RemovedInSphinx40Warning, stacklevel=2)
        else:
            outdir = self.outdir

        logger.info(__('writing %s file...'), outname)
        filename = path.join(outdir, outname)
        ensuredir(path.dirname(filename))
        copy_asset_file(path.join(self.template_dir, 'container.xml'), filename)

    def content_metadata(self) -> Dict[str, Any]:
        """Create a dictionary with all metadata for the content.opf
        file properly escaped.

        The ``manifest_items``, ``spines`` and ``guides`` lists start out empty.
        """
        metadata = {}  # type: Dict[str, Any]
        metadata['title'] = html.escape(self.config.epub_title)
        metadata['author'] = html.escape(self.config.epub_author)
        metadata['uid'] = html.escape(self.config.epub_uid)
        metadata['lang'] = html.escape(self.config.epub_language)
        metadata['publisher'] = html.escape(self.config.epub_publisher)
        metadata['copyright'] = html.escape(self.config.epub_copyright)
        metadata['scheme'] = html.escape(self.config.epub_scheme)
        metadata['id'] = html.escape(self.config.epub_identifier)
        metadata['date'] = html.escape(format_date("%Y-%m-%d"))
        metadata['manifest_items'] = []
        metadata['spines'] = []
        metadata['guides'] = []
        return metadata

    def build_content(self, outdir: str = None, outname: str = 'content.opf') -> None:
        """Write the metainfo file content.opf It contains bibliographic data,
        a file list and the spine (the reading order).

        As a side effect ``self.files`` and ``self.ignored_files`` are
        (re)computed.  Passing *outdir* is deprecated.
        """
        if outdir:
            warnings.warn('The arguments of EpubBuilder.build_content() is deprecated.',
                          RemovedInSphinx40Warning, stacklevel=2)
        else:
            outdir = self.outdir

        logger.info(__('writing %s file...'), outname)
        metadata = self.content_metadata()

        # files
        if not outdir.endswith(os.sep):
            outdir += os.sep
        olen = len(outdir)  # used to strip the outdir prefix from walked paths
        self.files = []  # type: List[str]
        self.ignored_files = ['.buildinfo', 'mimetype', 'content.opf',
                              'toc.ncx', 'META-INF/container.xml',
                              'Thumbs.db', 'ehthumbs.db', '.DS_Store',
                              'nav.xhtml', self.config.epub_basename + '.epub'] + \
            self.config.epub_exclude_files
        if not self.use_index:
            self.ignored_files.append('genindex' + self.out_suffix)
        for root, dirs, files in os.walk(outdir):
            # sort in place so that the walk order is deterministic
            dirs.sort()
            for fn in sorted(files):
                filename = path.join(root, fn)[olen:]
                if filename in self.ignored_files:
                    continue
                ext = path.splitext(filename)[-1]
                if ext not in self.media_types:
                    # we always have JS and potentially OpenSearch files, don't
                    # always warn about them
                    if ext not in ('.js', '.xml'):
                        logger.warning(__('unknown mimetype for %s, ignoring'), filename,
                                       type='epub', subtype='unknown_project_files')
                    continue
                filename = filename.replace(os.sep, '/')
                item = ManifestItem(html.escape(filename),
                                    html.escape(self.make_id(filename)),
                                    html.escape(self.media_types[ext]))
                metadata['manifest_items'].append(item)
                self.files.append(filename)

        # spine
        spinefiles = set()
        for refnode in self.refnodes:
            # entries pointing into a document are not separate spine items
            if '#' in refnode['refuri']:
                continue
            if refnode['refuri'] in self.ignored_files:
                continue
            spine = Spine(html.escape(self.make_id(refnode['refuri'])), True)
            metadata['spines'].append(spine)
            spinefiles.add(refnode['refuri'])
        for info in self.domain_indices:
            spine = Spine(html.escape(self.make_id(info[0] + self.out_suffix)), True)
            metadata['spines'].append(spine)
            spinefiles.add(info[0] + self.out_suffix)
        if self.use_index:
            spine = Spine(html.escape(self.make_id('genindex' + self.out_suffix)), True)
            metadata['spines'].append(spine)
            spinefiles.add('genindex' + self.out_suffix)
        # add auto generated files
        for name in self.files:
            if name not in spinefiles and name.endswith(self.out_suffix):
                spine = Spine(html.escape(self.make_id(name)), False)
                metadata['spines'].append(spine)

        # add the optional cover
        html_tmpl = None
        if self.config.epub_cover:
            image, html_tmpl = self.config.epub_cover
            image = image.replace(os.sep, '/')
            metadata['cover'] = html.escape(self.make_id(image))
            if html_tmpl:
                spine = Spine(html.escape(self.make_id(self.coverpage_name)), True)
                metadata['spines'].insert(0, spine)
                if self.coverpage_name not in self.files:
                    ext = path.splitext(self.coverpage_name)[-1]
                    self.files.append(self.coverpage_name)
                    item = ManifestItem(html.escape(self.coverpage_name),
                                        html.escape(self.make_id(self.coverpage_name)),
                                        html.escape(self.media_types[ext]))
                    metadata['manifest_items'].append(item)
                ctx = {'image': html.escape(image), 'title': self.config.project}
                self.handle_page(
                    path.splitext(self.coverpage_name)[0], ctx, html_tmpl)
                spinefiles.add(self.coverpage_name)

        auto_add_cover = True
        auto_add_toc = True
        if self.config.epub_guide:
            for guide_type, uri, title in self.config.epub_guide:
                guide_file = uri.split('#')[0]
                if guide_file not in self.files:
                    self.files.append(guide_file)
                # an explicit cover/toc guide suppresses the automatic one
                if guide_type == 'cover':
                    auto_add_cover = False
                if guide_type == 'toc':
                    auto_add_toc = False
                metadata['guides'].append(Guide(html.escape(guide_type),
                                                html.escape(title),
                                                html.escape(uri)))
        if auto_add_cover and html_tmpl:
            metadata['guides'].append(Guide('cover',
                                            self.guide_titles['cover'],
                                            html.escape(self.coverpage_name)))
        if auto_add_toc and self.refnodes:
            metadata['guides'].append(Guide('toc',
                                            self.guide_titles['toc'],
                                            html.escape(self.refnodes[0]['refuri'])))

        # write the project file
        copy_asset_file(path.join(self.template_dir, 'content.opf_t'),
                        path.join(outdir, outname),
                        metadata)

    def new_navpoint(self, node: Dict[str, Any], level: int, incr: bool = True) -> NavPoint:
        """Create a new entry in the toc from the node at given level.

        ``self.tocid`` is always advanced; ``self.playorder`` only if *incr*
        is true.
        """
        # XXX Modifies the node
        if incr:
            self.playorder += 1
        self.tocid += 1
        return NavPoint('navPoint%d' % self.tocid, self.playorder,
                        node['text'], node['refuri'], [])

    def build_navpoints(self, nodes: List[Dict[str, Any]]) -> List[NavPoint]:
        """Create the toc navigation structure.

        Subelements of a node are nested inside the navpoint.  For nested nodes
        the parent node is reinserted in the subnav.

        Raises ``RuntimeError`` if an entry is more than one level deeper
        than the entry before it.
        """
        navstack = []  # type: List[NavPoint]
        # the dummy root collects the top-level navpoints in its children
        navstack.append(NavPoint('dummy', '', '', '', []))
        level = 0
        lastnode = None
        for node in nodes:
            if not node['text']:
                continue

            target_file = node['refuri'].split('#')[0]
            if target_file in self.ignored_files:
                continue
            if node['level'] > self.config.epub_tocdepth:
                continue
            if node['level'] == level:
                navpoint = self.new_navpoint(node, level)
                navstack.pop()
                navstack[-1].children.append(navpoint)
                navstack.append(navpoint)
            elif node['level'] == level + 1:
                level += 1
                if lastnode and self.config.epub_tocdup:
                    # Insert starting point in subtoc with same playOrder
                    navstack[-1].children.append(self.new_navpoint(lastnode, level, False))
                navpoint = self.new_navpoint(node, level)
                navstack[-1].children.append(navpoint)
                navstack.append(navpoint)
            elif node['level'] < level:
                while node['level'] < len(navstack):
                    navstack.pop()
                level = node['level']
                navpoint = self.new_navpoint(node, level)
                navstack[-1].children.append(navpoint)
                navstack.append(navpoint)
            else:
                # The entry skips at least one level.  A bare ``raise`` here
                # would only report "No active exception to reraise".
                raise RuntimeError(
                    'unexpected ToC level %r after level %r for %r'
                    % (node['level'], level, node['refuri']))
            lastnode = node

        return navstack[0].children

    def toc_metadata(self, level: int, navpoints: List[NavPoint]) -> Dict[str, Any]:
        """Create a dictionary with all metadata for the toc.ncx file.

        The title is HTML-escaped here.
        """
        metadata = {}  # type: Dict[str, Any]
        # NOTE(review): unlike content_metadata() the uid is not escaped
        # here -- confirm this is intended.
        metadata['uid'] = self.config.epub_uid
        metadata['title'] = html.escape(self.config.epub_title)
        metadata['level'] = level
        metadata['navpoints'] = navpoints
        return metadata

    def build_toc(self, outdir: str = None, outname: str = 'toc.ncx') -> None:
        """Write the metainfo file toc.ncx.

        Passing *outdir* is deprecated; by default ``self.outdir`` is used.
        """
        if outdir:
            warnings.warn('The arguments of EpubBuilder.build_toc() is deprecated.',
                          RemovedInSphinx40Warning, stacklevel=2)
        else:
            outdir = self.outdir

        logger.info(__('writing %s file...'), outname)

        if self.config.epub_tocscope == 'default':
            doctree = self.env.get_and_resolve_doctree(self.config.master_doc,
                                                       self, prune_toctrees=False,
                                                       includehidden=False)
            refnodes = self.get_refnodes(doctree, [])
            self.toc_add_files(refnodes)
        else:
            # 'includehidden'
            refnodes = self.refnodes
        self.check_refnodes(refnodes)
        navpoints = self.build_navpoints(refnodes)
        # NOTE(review): the depth is taken from self.refnodes even when the
        # 'default' scope built a separate list above -- confirm.
        level = max(item['level'] for item in self.refnodes)
        level = min(level, self.config.epub_tocdepth)
        copy_asset_file(path.join(self.template_dir, 'toc.ncx_t'),
                        path.join(outdir, outname),
                        self.toc_metadata(level, navpoints))

    def build_epub(self, outdir: str = None, outname: str = None) -> None:
        """Write the epub file.

        It is a zip file with the mimetype file stored uncompressed as the first
        entry.

        Passing *outdir* is deprecated.  Without *outdir* the file is always
        written to ``self.outdir`` under the name ``epub_basename + '.epub'``.
        """
        if outdir:
            warnings.warn('The arguments of EpubBuilder.build_epub() is deprecated.',
                          RemovedInSphinx40Warning, stacklevel=2)
            if outname is None:
                # fall back to the configured name instead of failing on None
                outname = self.config.epub_basename + '.epub'
        else:
            outdir = self.outdir
            outname = self.config.epub_basename + '.epub'

        logger.info(__('writing %s file...'), outname)
        epub_filename = path.join(outdir, outname)
        with ZipFile(epub_filename, 'w', ZIP_DEFLATED) as epub:
            epub.write(path.join(outdir, 'mimetype'), 'mimetype', ZIP_STORED)
            for filename in ['META-INF/container.xml', 'content.opf', 'toc.ncx']:
                epub.write(path.join(outdir, filename), filename, ZIP_DEFLATED)
            for filename in self.files:
                epub.write(path.join(outdir, filename), filename, ZIP_DEFLATED)