Source code for hdf5storage.pathesc

# Copyright (c) 2016-2020, Freja Nordsiek
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

""" Module for handling paths. """


import collections.abc
import pathlib
import posixpath
import re


# Escaping and unescaping unicode paths needs compiled regular
# expressions to find sequences of one or more dots, to find slashes
# and nulls, and to find hex/unicode escapes (both well-formed and
# malformed ones). In addition, a dict is needed to look up the escapes
# used for the slashes and null. Compiling the regular expressions here
# at initialization will help performance by not having to compile new
# ones every time a path is processed.

# A run of one or more '.' characters.
_find_dots_re = re.compile('\\.+')
# An odd-length run of backslashes (starting at the beginning of the
# string or right after a non-backslash character) that is NOT followed
# by a complete escape, i.e. it is followed by
#   * the end of the string or any character other than x, u, U;
#   * 'x' with fewer than 2 hex digits after it;
#   * 'u' with fewer than 4 hex digits after it; or
#   * 'U' with fewer than 8 hex digits after it.
_find_invalid_escape_re = re.compile(
    '(^|[^\\\\])\\\\(\\\\\\\\)*($|[^xuU\\\\]'
    '|x[0-9a-fA-F]?($|[^0-9a-fA-F])'
    '|u[0-9a-fA-F]{0,3}($|[^0-9a-fA-F])'
    '|U[0-9a-fA-F]{0,7}($|[^0-9a-fA-F]))')
# A single backslash, forward slash, or null character.
_find_fslashnull_re = re.compile('[\\\\/\x00]')
# One or more backslashes followed by the body of a hex/unicode escape:
# xYY, uYYYY, or UYYYYYYYY where Y is a hex digit.
_find_escapes_re = re.compile(
    '\\\\+(x[0-9a-fA-F]{2}|u[0-9a-fA-F]{4}|U[0-9a-fA-F]{8})')
# Fixed escapes for the characters that get special treatment: null and
# forward slash become hex escapes, a backslash is doubled.
_char_escape_conversions = {'\x00': '\\x00',
                            '/': '\\x2f',
                            '\\': '\\\\'}


def _replace_fun_escape(m):
    """ Escape the single character held by a regex match.

    Replacement callback for ``re.sub``. Characters listed in
    ``_char_escape_conversions`` get their fixed escape (for instance a
    backslash becomes a double backslash). Any other character is
    turned into the shortest of ``'\\xYY'``, ``'\\uYYYY'``, and
    ``'\\UYYYYYYYY'`` (Y being a lowercase hex digit) that can hold its
    character code.

    .. versionadded:: 0.2

    Parameters
    ----------
    m : regex match
        Match whose whole matched text (group 0) is the one character
        to escape.

    Returns
    -------
    s : str
        The escaped form of the character.

    Raises
    ------
    NotImplementedError
        If the character code is larger than 0xFFFFFFFF.

    """
    char = m.group(0)
    # Characters with a dedicated escape take priority over the generic
    # numeric forms below.
    try:
        return _char_escape_conversions[char]
    except KeyError:
        pass
    # Pick the narrowest numeric escape that fits the character code.
    code = ord(char)
    for limit, template in ((0xFF, '\\x{0:02x}'),
                            (0xFFFF, '\\u{0:04x}'),
                            (0xFFFFFFFF, '\\U{0:08x}')):
        if code <= limit:
            return template.format(code)
    # Defensive only: ord() of a str character never exceeds 0x10FFFF.
    raise NotImplementedError('Cannot escape a character whose '
                              'code it outside of the range '
                              '0 - 0xFFFFFFFF.')


def _replace_fun_unescape(m):
    """ Decode one hex/unicode escape held by a regex match.

    Replacement callback for ``re.sub``. The matched text is expected
    to be a run of backslashes followed by ``'xYY'``, ``'uYYYY'``, or
    ``'UYYYYYYYY'`` where Y is a hex digit. The escape is decoded only
    when the number of backslashes is odd; with an even number the
    backslashes are all escaped backslashes and the text is returned
    untouched.

    .. versionadded:: 0.2

    Parameters
    ----------
    m : regex match
        Match whose whole matched text (group 0) is the backslash run
        plus the escape body.

    Returns
    -------
    c : str
        The matched text with the escape decoded (all backslashes but
        the last one are kept in front of the decoded character), or
        the matched text unchanged if the backslash count is even.

    """
    text = m.group(0)
    n_slashes = text.count('\\')
    if n_slashes % 2:
        # The last backslash and the x/u/U marker introduce the hex
        # digits; everything before the last backslash is kept as is.
        codepoint = int(text[n_slashes + 1:], base=16)
        return '\\' * (n_slashes - 1) + chr(codepoint)
    return text


def escape_path(pth):
    """ Hex/unicode escapes a path.

    Escapes a path so that it can be stored faithfully as a name in an
    HDF5 file without being interpreted as a change of directory.
    Every leading ``'.'`` is replaced with ``'\\x2e'``. In the rest of
    the path, ``'/'`` is replaced with ``'\\x2f'``, null is replaced
    with ``'\\x00'``, and each backslash is replaced with a double
    backslash.

    .. versionadded:: 0.2

    Parameters
    ----------
    pth : str or bytes
        The path to escape. ``bytes`` are decoded as UTF-8 first.

    Returns
    -------
    epth : str
        The escaped path.

    Raises
    ------
    TypeError
        If `pth` is not the right type.

    See Also
    --------
    unescape_path

    """
    if isinstance(pth, bytes):
        pth = pth.decode('utf-8')
    if not isinstance(pth, str):
        raise TypeError('pth must be str or bytes.')
    # Count the run of dots at the very start (zero when there is
    # none); each of them is hex escaped individually.
    leading = _find_dots_re.match(pth)
    n_dots = 0 if leading is None else leading.end()
    # The remainder only needs its backslashes, slashes, and nulls
    # escaped.
    remainder = _find_fslashnull_re.sub(_replace_fun_escape,
                                        pth[n_dots:])
    return '\\x2e' * n_dots + remainder
# A backslash optionally followed by the rest of a valid escape: a
# second backslash, xYY, uYYYY, or UYYYYYYYY (Y being a hex digit). The
# group is empty (None) exactly when the backslash does not start a
# valid escape.
_unescape_token_re = re.compile(
    r'\\(\\|x[0-9a-fA-F]{2}|u[0-9a-fA-F]{4}|U[0-9a-fA-F]{8})?')


def _unescape_token(m):
    """ Decode one escape token matched by ``_unescape_token_re``.

    Parameters
    ----------
    m : regex match
        Match of ``_unescape_token_re``.

    Returns
    -------
    c : str
        The character the escape stands for.

    Raises
    ------
    ValueError
        If the backslash does not start a valid escape or the escape
        names a code point above 0x10FFFF.

    """
    tail = m.group(1)
    if tail is None:
        raise ValueError('Invalid escape found.')
    if tail == '\\':
        # Double backslash is the escape for a single backslash.
        return tail
    code = int(tail[1:], base=16)
    # An 8 digit escape can name values that are not code points;
    # reject them here instead of letting chr() fail.
    if code > 0x10FFFF:
        raise ValueError('Invalid escape found.')
    return chr(code)


def unescape_path(pth):
    """ Hex/unicode unescapes a path.

    Unescapes a path. Valid escapes are ``'\\xYY'``, ``'\\uYYYY'``, and
    ``'\\UYYYYYYYY'`` where Y are hex digits giving the character's
    unicode numerical value, and double backslashes which are the
    escape for single backslashes.

    The path is decoded in a single left to right pass, so a backslash
    produced by decoding an escape is never mistaken for the start of
    another escape (two consecutive hex escaped backslashes decode to
    two backslashes).

    .. versionadded:: 0.2

    Parameters
    ----------
    pth : str or bytes
        The path to unescape. ``bytes`` are decoded as UTF-8 first.

    Returns
    -------
    unpth : str
        The unescaped path.

    Raises
    ------
    TypeError
        If `pth` is not the right type.
    ValueError
        If an invalid escape is found.

    See Also
    --------
    escape_path

    """
    if isinstance(pth, bytes):
        pth = pth.decode('utf-8')
    if not isinstance(pth, str):
        raise TypeError('pth must be str or bytes.')
    # Validation and decoding happen in the same scan: every backslash
    # is either the start of a token or the second half of a double
    # backslash, and a token without a valid tail raises ValueError.
    return _unescape_token_re.sub(_unescape_token, pth)
def process_path(pth):
    """ Processes paths.

    Processes the provided path and breaks it into its Group part
    (`groupname`) and target part (`targetname`). ``bytes`` paths are
    converted to ``str``. Separated paths are given as an iterable of
    ``str``, ``bytes``, and ``pathlib.PurePath``. Each part of a
    separated path is escaped using ``escape_path``. Otherwise, the
    path is assumed to be already escaped. Escaping is done so that
    targets with a part that starts with one or more periods, contain
    slashes, and/or contain nulls can be used without causing the wrong
    Group to be looked in or the wrong target to be looked at. It
    essentially allows one to make a Dataset named ``'..'`` or
    ``'a/a'`` instead of moving around in the Dataset hierarchy.

    All paths are POSIX style.

    A separated path with no parts, or a ``pathlib.PurePath`` with no
    usable parts, is treated like the empty path.

    .. versionadded:: 0.2

    Parameters
    ----------
    pth : str or bytes or pathlib.PurePath or Iterable
        The POSIX style path as a ``str`` or ``bytes`` or the separated
        path in an Iterable with the elements being ``str``, ``bytes``,
        and ``pathlib.PurePath``. For separated paths, escaping will be
        done on each part. Any Iterable is accepted, including
        generators; it is read exactly once.

    Returns
    -------
    groupname : str
        The path to the Group containing the target `pth` was pointing
        to.
    targetname : str
        The name of the target pointed to by `pth` in the Group
        `groupname`.

    Raises
    ------
    TypeError
        If `pth` is not of the right type.

    See Also
    --------
    escape_path

    """
    # Do conversions and possibly escapes.
    if isinstance(pth, bytes):
        p = pth.decode('utf-8')
    elif isinstance(pth, str):
        p = pth
    elif isinstance(pth, pathlib.PurePath):
        parts = pth.parts
        # A root other than '' or '/' (e.g. a Windows anchor) has no
        # POSIX meaning, so the first part is dropped.
        if pth.root not in ('', '/'):
            parts = parts[1:]
        # posixpath.join needs at least one argument.
        p = posixpath.join(*parts) if parts else ''
    elif not isinstance(pth, collections.abc.Iterable):
        raise TypeError('p must be str, bytes, pathlib.PurePath, or '
                        'an Iterable solely of one of those three.')
    else:
        # Materialize once so that one-shot iterables (generators,
        # iterators) can be both validated and converted.
        elements = tuple(pth)
        # Check that all elements are unicode, bytes, or PurePath.
        if not all(isinstance(s, (bytes, str, pathlib.PurePath))
                   for s in elements):
            raise TypeError('Elements of p must be str, bytes, or '
                            'pathlib.PurePath.')
        # Escape (and possibly convert to str) each element and then
        # join them all together.
        parts = []
        for s in elements:
            if isinstance(s, bytes):
                s = s.decode('utf-8')
            elif isinstance(s, pathlib.PurePath):
                s = str(s)
            parts.append(escape_path(s))
        # posixpath.join needs at least one argument.
        p = posixpath.join(*parts) if parts else ''

    # Remove double slashes and a non-root trailing slash.
    path = posixpath.normpath(p)

    # Extract the group name and the target name (will be a dataset if
    # data can be mapped to it, but will end up being made into a group
    # otherwise). As HDF5 files use posix path conventions, posixpath
    # will do everything.
    groupname = posixpath.dirname(path)
    targetname = posixpath.basename(path)

    # If groupname got turned into blank, then it is just root.
    if not groupname:
        groupname = '/'

    # If targetname got turned blank, then it is the current directory.
    if not targetname:
        targetname = '.'

    return groupname, targetname