swh:1:snp:4e3e7077647a709f15b8c1b32ce7100175d0580b
Raw File
Tip revision: 6c0251f7530fe52c2a9eb63f76de7041643f18f0 authored by Jean Kossaifi on 16 December 2019, 10:32:24 UTC
Merge pull request #146 from yngvem/master
Tip revision: 6c0251f
docs_resolv.py
# -*- coding: utf-8 -*-
# Author: Óscar Nájera
# License: 3-clause BSD
"""
Link resolver objects
=====================
"""
from __future__ import print_function
import codecs
import gzip
import os
import posixpath
import re
import shelve
import sys

# Try Python 2 first, otherwise load from Python 3
try:
    import cPickle as pickle
except ImportError:
    import pickle
try:
    import urllib2 as urllib_request
    from urllib2 import HTTPError, URLError
    import urlparse as urllib_parse
except ImportError:
    import urllib.request as urllib_request
    import urllib.parse as urllib_parse
    from urllib.error import HTTPError, URLError

from io import StringIO

from . import sphinx_compatibility


logger = sphinx_compatibility.getLogger('sphinx-gallery')


def _get_data(url):
    """Helper function to get data over http(s) or from a local file"""
    if urllib_parse.urlparse(url).scheme in ('http', 'https'):
        resp = urllib_request.urlopen(url)
        encoding = resp.headers.get('content-encoding', 'plain')
        data = resp.read()
        if encoding == 'plain':
            data = data.decode('utf-8')
        elif encoding == 'gzip':
            data = StringIO(data)
            data = gzip.GzipFile(fileobj=data).read().decode('utf-8')
        else:
            raise RuntimeError('unknown encoding')
    else:
        with codecs.open(url, mode='r', encoding='utf-8') as fid:
            data = fid.read()

    return data


def get_data(url, gallery_dir):
    """Persistent dictionary usage to retrieve the search indexes"""

    # shelve keys need to be str in python 2
    if sys.version_info[0] == 2 and isinstance(url, unicode):
        url = url.encode('utf-8')

    cached_file = os.path.join(gallery_dir, 'searchindex')
    search_index = shelve.open(cached_file)
    if url in search_index:
        data = search_index[url]
    else:
        data = _get_data(url)
        search_index[url] = data
    search_index.close()

    return data


def _select_block(str_in, start_tag, end_tag):
    """Select first block delimited by start_tag and end_tag"""
    start_pos = str_in.find(start_tag)
    if start_pos < 0:
        raise ValueError('start_tag not found')
    depth = 0
    for pos in range(start_pos, len(str_in)):
        if str_in[pos] == start_tag:
            depth += 1
        elif str_in[pos] == end_tag:
            depth -= 1

        if depth == 0:
            break
    sel = str_in[start_pos + 1:pos]
    return sel


def _parse_dict_recursive(dict_str):
    """Parse a dictionary from the search index"""
    dict_out = dict()
    pos_last = 0
    pos = dict_str.find(':')
    while pos >= 0:
        key = dict_str[pos_last:pos]
        if dict_str[pos + 1] == '[':
            # value is a list
            pos_tmp = dict_str.find(']', pos + 1)
            if pos_tmp < 0:
                raise RuntimeError('error when parsing dict')
            value = dict_str[pos + 2: pos_tmp].split(',')
            # try to convert elements to int
            for i in range(len(value)):
                try:
                    value[i] = int(value[i])
                except ValueError:
                    pass
        elif dict_str[pos + 1] == '{':
            # value is another dictionary
            subdict_str = _select_block(dict_str[pos:], '{', '}')
            value = _parse_dict_recursive(subdict_str)
            pos_tmp = pos + len(subdict_str)
        else:
            raise ValueError('error when parsing dict: unknown elem')

        key = key.strip('"')
        if len(key) > 0:
            dict_out[key] = value

        pos_last = dict_str.find(',', pos_tmp)
        if pos_last < 0:
            break
        pos_last += 1
        pos = dict_str.find(':', pos_last)

    return dict_out


def parse_sphinx_searchindex(searchindex):
    """Parse a Sphinx search index

    Parameters
    ----------
    searchindex : str
        The Sphinx search index (contents of searchindex.js)

    Returns
    -------
    filenames : list of str
        The file names parsed from the search index.
    objects : dict
        The objects parsed from the search index.
    """
    # Make sure searchindex uses UTF-8 encoding
    if hasattr(searchindex, 'decode'):
        searchindex = searchindex.decode('UTF-8')

    # parse objects
    query = 'objects:'
    pos = searchindex.find(query)
    if pos < 0:
        raise ValueError('"objects:" not found in search index')

    sel = _select_block(searchindex[pos:], '{', '}')
    objects = _parse_dict_recursive(sel)

    # parse filenames
    query = 'filenames:'
    pos = searchindex.find(query)
    if pos < 0:
        raise ValueError('"filenames:" not found in search index')
    filenames = searchindex[pos + len(query) + 1:]
    filenames = filenames[:filenames.find(']')]
    filenames = [f.strip('"') for f in filenames.split(',')]

    return filenames, objects


class SphinxDocLinkResolver(object):
    """ Resolve documentation links using searchindex.js generated by Sphinx

    Parameters
    ----------
    doc_url : str
        The base URL of the project website.
    searchindex : str
        Filename of searchindex, relative to doc_url.
    extra_modules_test : list of str
        List of extra module names to test.
    relative : bool
        Return relative links (only useful for links to documentation of this
        package).
    """

    def __init__(self, doc_url, gallery_dir, searchindex='searchindex.js',
                 extra_modules_test=None, relative=False):
        self.doc_url = doc_url
        self.gallery_dir = gallery_dir
        self.relative = relative
        self._link_cache = {}

        self.extra_modules_test = extra_modules_test
        self._page_cache = {}
        if doc_url.startswith(('http://', 'https://')):
            if relative:
                raise ValueError('Relative links are only supported for local '
                                 'URLs (doc_url cannot be absolute)')
            searchindex_url = doc_url + '/' + searchindex
        else:
            searchindex_url = os.path.join(doc_url, searchindex)

        # detect if we are using relative links on a Windows system
        if (os.name.lower() == 'nt' and
                not doc_url.startswith(('http://', 'https://'))):
            if not relative:
                raise ValueError('You have to use relative=True for the local'
                                 ' package on a Windows system.')
            self._is_windows = True
        else:
            self._is_windows = False

        # download and initialize the search index
        sindex = get_data(searchindex_url, gallery_dir)
        filenames, objects = parse_sphinx_searchindex(sindex)

        self._searchindex = dict(filenames=filenames, objects=objects)

    def _get_link(self, cobj):
        """Get a valid link, False if not found"""

        fname_idx = None
        full_name = cobj['module_short'] + '.' + cobj['name']
        if full_name in self._searchindex['objects']:
            value = self._searchindex['objects'][full_name]
            if isinstance(value, dict):
                value = value[next(iter(value.keys()))]
            fname_idx = value[0]
        elif cobj['module_short'] in self._searchindex['objects']:
            value = self._searchindex['objects'][cobj['module_short']]
            if cobj['name'] in value.keys():
                fname_idx = value[cobj['name']][0]

        if fname_idx is not None:
            fname = self._searchindex['filenames'][fname_idx]
            # In 1.5+ Sphinx seems to have changed from .rst.html to only
            # .html extension in converted files. But URLs could be
            # built with < 1.5 or >= 1.5 regardless of what we're currently
            # building with, so let's just check both :(
            fnames = [fname + '.html', os.path.splitext(fname)[0] + '.html']
            for i, fname in enumerate(fnames):
                try:
                    if self._is_windows:
                        fname = fname.replace('/', '\\')
                        link = os.path.join(self.doc_url, fname)
                    else:
                        link = posixpath.join(self.doc_url, fname)

                    if link in self._page_cache:
                        html = self._page_cache[link]
                    else:
                        html = get_data(link, self.gallery_dir)
                        self._page_cache[link] = html
                except (HTTPError, URLError, IOError):
                    if i == len(fnames) - 1:
                        raise
                else:
                    break

            # test if cobj appears in page
            comb_names = [cobj['module_short'] + '.' + cobj['name']]
            if self.extra_modules_test is not None:
                for mod in self.extra_modules_test:
                    comb_names.append(mod + '.' + cobj['name'])
            url = False

            for comb_name in comb_names:
                if comb_name in html:
                    url = link + u'#' + comb_name
            link = url
        else:
            link = False

        return link

    def resolve(self, cobj, this_url):
        """Resolve the link to the documentation, returns None if not found

        Parameters
        ----------
        cobj : dict
            Dict with information about the "code object" for which we are
            resolving a link.
            cobj['name'] : function or class name (str)
            cobj['module_short'] : shortened module name (str)
            cobj['module'] : module name (str)
        this_url: str
            URL of the current page. Needed to construct relative URLs
            (only used if relative=True in constructor).

        Returns
        -------
        link : str | None
            The link (URL) to the documentation.
        """
        full_name = cobj['module_short'] + '.' + cobj['name']
        link = self._link_cache.get(full_name, None)
        if link is None:
            # we don't have it cached
            link = self._get_link(cobj)
            # cache it for the future
            self._link_cache[full_name] = link

        if link is False or link is None:
            # failed to resolve
            return None

        if self.relative:
            link = os.path.relpath(link, start=this_url)
            if self._is_windows:
                # replace '\' with '/' so it on the web
                link = link.replace('\\', '/')

            # for some reason, the relative link goes one directory too high up
            link = link[3:]

        return link


def _embed_code_links(app, gallery_conf, gallery_dir):
    # Add resolvers for the packages for which we want to show links
    doc_resolvers = {}

    src_gallery_dir = os.path.join(app.builder.srcdir, gallery_dir)
    for this_module, url in gallery_conf['reference_url'].items():
        try:
            if url is None:
                doc_resolvers[this_module] = SphinxDocLinkResolver(
                    app.builder.outdir,
                    src_gallery_dir,
                    relative=True)
            else:
                doc_resolvers[this_module] = SphinxDocLinkResolver(url,
                                                                   src_gallery_dir)

        except HTTPError as e:
            logger.warning("The following HTTP Error has occurred: %d", e.code)
        except URLError as e:
            logger.warning(
                "Embedding the documentation hyperlinks requires Internet "
                "access.\nPlease check your network connection.\nUnable to "
                "continue embedding `%s` links due to a URL Error:\n%s",
                this_module,
                str(e.args))

    html_gallery_dir = os.path.abspath(os.path.join(app.builder.outdir,
                                                    gallery_dir))

    # patterns for replacement
    link_pattern = ('<a href="%s" title="View documentation for %s">%s</a>')
    orig_pattern = '<span class="n">%s</span>'
    period = '<span class="o">.</span>'

    # This could be turned into a generator if necessary, but should be okay
    flat = [[dirpath, filename]
            for dirpath, _, filenames in os.walk(html_gallery_dir)
            for filename in filenames]
    iterator = sphinx_compatibility.status_iterator(
        flat, gallery_dir, color='fuchsia',
        length=len(flat), stringify_func=lambda x: os.path.basename(x[1]))
    for dirpath, fname in iterator:
        full_fname = os.path.join(html_gallery_dir, dirpath, fname)
        subpath = dirpath[len(html_gallery_dir) + 1:]
        pickle_fname = os.path.join(src_gallery_dir, subpath,
                                    fname[:-5] + '_codeobj.pickle')

        if os.path.exists(pickle_fname):
            # we have a pickle file with the objects to embed links for
            with open(pickle_fname, 'rb') as fid:
                example_code_obj = pickle.load(fid)
            fid.close()
            str_repl = {}
            # generate replacement strings with the links
            for name, cobj in example_code_obj.items():
                this_module = cobj['module'].split('.')[0]

                if this_module not in doc_resolvers:
                    continue

                try:
                    link = doc_resolvers[this_module].resolve(cobj,
                                                              full_fname)
                except (HTTPError, URLError) as e:
                    if isinstance(e, HTTPError):
                        extra = e.code
                    else:
                        extra = e.reason
                    logger.warning("Error resolving %s.%s: %r (%s)",
                                   cobj['module'], cobj['name'], e, extra)
                    continue

                if link is not None:
                    parts = name.split('.')
                    name_html = period.join(orig_pattern % part
                                            for part in parts)
                    full_function_name = '%s.%s' % (
                        cobj['module'], cobj['name'])
                    str_repl[name_html] = link_pattern % (
                        link, full_function_name, name_html)
            # do the replacement in the html file

            # ensure greediness
            names = sorted(str_repl, key=len, reverse=True)
            regex_str = '|'.join(re.escape(name) for name in names)
            regex = re.compile(regex_str)

            def substitute_link(match):
                return str_repl[match.group()]

            if len(str_repl) > 0:
                with open(full_fname, 'rb') as fid:
                    lines_in = fid.readlines()
                with open(full_fname, 'wb') as fid:
                    for line in lines_in:
                        line = line.decode('utf-8')
                        line = regex.sub(substitute_link, line)
                        fid.write(line.encode('utf-8'))


def embed_code_links(app, exception):
    """Embed hyperlinks to documentation into example code"""
    if exception is not None:
        return

    # No need to waste time embedding hyperlinks when not running the examples
    # XXX: also at the time of writing this fixes make html-noplot
    # for some reason I don't fully understand
    if not app.builder.config.plot_gallery:
        return

    # XXX: Whitelist of builders for which it makes sense to embed
    # hyperlinks inside the example html. Note that the link embedding
    # require searchindex.js to exist for the links to the local doc
    # and there does not seem to be a good way of knowing which
    # builders creates a searchindex.js.
    if app.builder.name not in ['html', 'readthedocs']:
        return

    logger.info('Embedding documentation hyperlinks in examples ...',
                color='white')

    gallery_conf = app.config.sphinx_gallery_conf

    gallery_dirs = gallery_conf['gallery_dirs']
    if not isinstance(gallery_dirs, list):
        gallery_dirs = [gallery_dirs]

    for gallery_dir in gallery_dirs:
        _embed_code_links(app, gallery_conf, gallery_dir)
back to top