[python-hdf5storage] 85/152: Added an option to optionally convert numpy.str_ to np.uint16 in UTF-16 format and changed it so that str is converted to numpy.str_ instead of numpy.bytes_.

Mon Feb 29 08:24:37 UTC 2016

This is an automated email from the git hooks/post-receive script.

ghisvail-guest pushed a commit to annotated tag 0.1
in repository python-hdf5storage.

commit ec11484243f8b33e26de2d1c27d4a1f392806c2b
Author: Freja Nordsiek <fnordsie at gmail.com>
Date:   Sun Feb 2 22:24:06 2014 -0500

    Added an option to optionally convert numpy.str_ to np.uint16 in UTF-16 format and changed it so that str is converted to numpy.str_ instead of numpy.bytes_.
---
 README.rst                 |  88 +++++++++++++++++++---------------
 hdf5storage/Marshallers.py |  46 +++++++++++-------
 hdf5storage/__init__.py    |  50 ++++++++++++++++++-
 hdf5storage/utilities.py   | 117 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 244 insertions(+), 57 deletions(-)

diff --git a/README.rst b/README.rst
index 4584b14..a4604bd 100644
--- a/README.rst
+++ b/README.rst
@@ -62,40 +62,52 @@ will be what it is read back as) the MATLAB class it becomes if
 targetting a MAT file, and the first version of this package to
 support writing it so MATlAB can read it.
 
-=============  =======  ==================  =======  ========
-Python                                      MATLAB
-------------------------------------------  -----------------
-Type           Version  Converted to        Class    Version
-=============  =======  ==================  =======  ========
-bool           0.1      np.bool\_           logical  0.1
-None           0.1      ``np.float64([])``  ``[]``   0.1
-int            0.1      np.int64            int64    0.1
-float          0.1      np.float64          double   0.1
-complex        0.1      np.complex128       double   0.1
-str            0.1      np.bytes\_          char     0.1 [1]_
-bytes          0.1      np.bytes\_          char     0.1
-bytearray      0.1      np.bytes\_          char     0.1
-np.bool\_      0.1                          logical  0.1
-np.uint8       0.1                          uint8    0.1
+=============  =======  ====================  ===========  ========
+Python                                        MATLAB
+--------------------------------------------  ---------------------
+Type           Version  Converted to          Class        Version
+=============  =======  ====================  ===========  ========
+bool           0.1      np.bool\_/np.uint8    logical      0.1 [1]_
+None           0.1      ``np.float64([])``    ``[]``       0.1
+int            0.1      np.int64              int64        0.1
+float          0.1      np.float64            double       0.1
+complex        0.1      np.complex128         double       0.1
+str            0.1      np.uint32/16          char         0.1 [2]_
+bytes          0.1      np.bytes\_/np.uint16  char         0.1 [3]_
+bytearray      0.1      np.bytes\_/np.uint16  char         0.1 [3]_
+np.bool\_      0.1                            logical      0.1
+np.uint8       0.1                            uint8        0.1
 np.float16     0.1
-np.float32     0.1                          single   0.1
-np.float64     0.1                          double   0.1
-np.complex64   0.1                          single   0.1
-np.complex128  0.1                          double   0.1
-np.str\_       0.1      np.uint32           uint32   0.1 [2]_
-np.bytes\_     0.1                          char     0.1
-np.object\_    0.1                          cell     0.1
-dict           0.1                          struct   0.1 [3]_
-list           0.1      np.object\_         cell     0.1
-tuple          0.1      np.object\_         cell     0.1
-set            0.1      np.object\_         cell     0.1
-frozenset      0.1      np.object\_         cell     0.1
-cl.deque       0.1      np.object\_         cell     0.1
-=============  =======  ==================  =======  ========
-
-.. [1] Converted to ASCII, so characters outside of that set are lost.
-.. [2] Simply copied over as the uint32 versions of each UTF-32 character.
-.. [3] All keys must be ``str``.
+np.float32     0.1                            single       0.1
+np.float64     0.1                            double       0.1
+np.complex64   0.1                            single       0.1
+np.complex128  0.1                            double       0.1
+np.str\_       0.1      np.uint32/16          char/uint32  0.1 [2]_
+np.bytes\_     0.1      np.bytes\_/np.uint16  char         0.1 [3]_
+np.object\_    0.1                            cell         0.1
+dict           0.1                            struct       0.1 [4]_
+list           0.1      np.object\_           cell         0.1
+tuple          0.1      np.object\_           cell         0.1
+set            0.1      np.object\_           cell         0.1
+frozenset      0.1      np.object\_           cell         0.1
+cl.deque       0.1      np.object\_           cell         0.1
+=============  =======  ====================  ===========  ========
+
+.. [1] Depends on the selected options. Always ``np.uint8`` when doing
+       MATLAB compatibility, or if the option is explicitly set.
+.. [2] Depends on the selected options and whether it can be converted
+       to UTF-16 without using doublets. If the option is explicitly set
+       (or implicitly through doing MATLAB compatibility) and it can be
+       converted to UTF-16 without losing any characters that can't be
+       represented in UTF-16 or using UTF-16 doublets (MATLAB doesn't
+       support them), then it is written as ``np.uint16`` in UTF-16
+       encoding. Otherwise, it is stored as ``np.uint32`` in UTF-32
+       encoding.
+.. [3] Depends on the selected options. If the option is explicitly set
+       (or implicitly through doing MATLAB compatibility), it will be
+       stored as ``np.uint16`` in UTF-16 encoding. Otherwise, it is just
+       written as ``np.bytes_``.
+.. [4] All keys must be ``str``.
 
 This table gives the MATLAB classes that can be read from a MAT file,
 the first version of this package that can read them, and the Python
@@ -105,8 +117,8 @@ type they are read as.
 MATLAB Class  Version  Python Type
 ============  =======  ================================
 logical       0.1      np.bool\_
-single        0.1      np.float32 or np.complex64 [4]_
-double        0.1      np.float64 or np.complex128 [4]_
+single        0.1      np.float32 or np.complex64 [5]_
+double        0.1      np.float64 or np.complex128 [5]_
 uint8         0.1      np.uint8
 uint16        0.1      np.uint16
 uint32        0.1      np.uint32
@@ -115,9 +127,9 @@ int8          0.1      np.int8
 int16         0.1      np.int16
 int32         0.1      np.int32
 int64         0.1      np.int64
-struct        0.1      dict [5]_
+struct        0.1      dict [6]_
 cell          0.1      np.object\_
 ============  =======  ================================
 
-.. [4] Depends on whether there is a complex part or not.
-.. [5] Structure arrays are not supported.
+.. [5] Depends on whether there is a complex part or not.
+.. [6] Structure arrays are not supported.
diff --git a/hdf5storage/Marshallers.py b/hdf5storage/Marshallers.py
index caf6ce1..be109e4 100644
--- a/hdf5storage/Marshallers.py
+++ b/hdf5storage/Marshallers.py
@@ -390,21 +390,33 @@ class NumpyScalarArrayMarshaller(TypeMarshaller):
                 data_to_store = np.uint16(np.atleast_1d( \
                     data_to_store).view(np.uint8))
 
-        # As of 2013-12-13, h5py cannot write numpy.unicode (UTF-32
-        # encoding) types. If it is just a numpy.unicode object, we can
-        # force it to UTF-16 or just write it as uint32's. If it is an
-        # array, forcing it to UTF-16 is a bad idea because characters
-        # are not always 2 bytes long in UTF-16. So, converting them to
-        # uint32 makes the most sense.
+        # As of 2013-12-13, h5py cannot write numpy.str_ (UTF-32
+        # encoding) types. If the option is set to try to convert them
+        # to UTF-16, then an attempt at the conversion is made. If no
+        # conversion is to be done, the conversion throws an exception
+        # (a UTF-32 character had no UTF-16 equivalent), or a UTF-32
+        # character gets turned into a UTF-16 doublet (the increase in
+        # the number of columns will be by a factor more than the length
+        # of the strings); then it will be simply converted to uint32's
+        # byte for byte instead.
 
         if data.dtype.type == np.str_:
-            if data_to_store.nbytes == 0:
-                data_to_store = np.uint32([])
+            new_data = None
+            if options.convert_numpy_str_to_utf16:
+                try:
+                    new_data = convert_numpy_str_to_uint16( \
+                        data_to_store)
+                except:
+                    pass
+            if new_data is None or (type(data_to_store) == np.str_ \
+                    and len(data_to_store) == len(new_data)) \
+                    or (isinstance(data_to_store, np.ndarray) \
+                    and new_data.shape[-1] != data_to_store.shape[-1] \
+                    * (data_to_store.dtype.itemsize//4)):
+                data_to_store = convert_numpy_str_to_uint32( \
+                    data_to_store)
             else:
-                shape = list(np.atleast_1d(data_to_store).shape)
-                shape[-1] *= data_to_store.dtype.itemsize//4
-                data_to_store = data_to_store.flatten().view(np.uint32)
-                data_to_store = data_to_store.reshape(tuple(shape))
+                data_to_store = new_data
 
         # Convert scalars to arrays if that option is set.
 
@@ -552,9 +564,10 @@ class NumpyScalarArrayMarshaller(TypeMarshaller):
 
         # If we are making it MATLAB compatible, the MATLAB_class
         # attribute needs to be set looking up the data type (gotten
-        # using np.dtype.type) and if it is a string or bool type, then
-        # the MATLAB_int_decode attribute must be set
-        # properly. Otherwise, the attributes must be deleted.
+        # using np.dtype.type). If it is a string or bool type, then
+        # the MATLAB_int_decode attribute must be set to the number of
+        # bytes each element takes up (dtype.itemsize). Otherwise,
+        # the attributes must be deleted.
 
         if options.matlab_compatible:
             tp = data.dtype.type
@@ -566,8 +579,7 @@ class NumpyScalarArrayMarshaller(TypeMarshaller):
 
             if tp in (np.bytes_, np.str_, np.bool_):
                 set_attribute(grp[name], 'MATLAB_int_decode', np.int64(
-                              {np.bool_: 1, np.bytes_: 2,
-                              np.str_: 4}[tp]))
+                              grp[name].dtype.itemsize))
             else:
                 del_attribute(grp[name], 'MATLAB_int_decode')
 
diff --git a/hdf5storage/__init__.py b/hdf5storage/__init__.py
index de228b1..8b3cec5 100644
--- a/hdf5storage/__init__.py
+++ b/hdf5storage/__init__.py
@@ -203,6 +203,7 @@ class Options(object):
     delete_unused_variables       ``True``
     convert_scalars_to_arrays     ``True``
     convert_numpy_bytes_to_utf16  ``True``
+    convert_numpy_str_to_utf16    ``True``
     convert_bools_to_uint8        ``True``
     reverse_dimension_order       ``True``
     store_shape_for_empty         ``True``
@@ -226,6 +227,8 @@ class Options(object):
         See Attributes.
     convert_numpy_bytes_to_utf16 : bool, optional
         See Attributes.
+    convert_numpy_str_to_utf16 : bool, optional
+        See Attributes.
     convert_bools_to_uint8 : bool, optional
         See Attributes.
     reverse_dimension_order : bool, optional
@@ -246,6 +249,7 @@ class Options(object):
     delete_unused_variables : bool
     convert_scalars_to_arrays : bool
     convert_numpy_bytes_to_utf16 : bool
+    convert_numpy_str_to_utf16 : bool
     convert_bools_to_uint8 : bool
     reverse_dimension_order : bool
     store_shape_for_empty : bool
@@ -264,6 +268,7 @@ class Options(object):
                  delete_unused_variables=False,
                  convert_scalars_to_arrays=False,
                  convert_numpy_bytes_to_utf16=False,
+                 convert_numpy_str_to_utf16=False,
                  convert_bools_to_uint8=False,
                  reverse_dimension_order=False,
                  store_shape_for_empty=False,
@@ -276,6 +281,7 @@ class Options(object):
         self._delete_unused_variables = False
         self._convert_scalars_to_arrays = False
         self._convert_numpy_bytes_to_utf16 = False
+        self._convert_numpy_str_to_utf16 = False
         self._convert_bools_to_uint8 = False
         self._reverse_dimension_order = False
         self._store_shape_for_empty = False
@@ -291,6 +297,7 @@ class Options(object):
         self.delete_unused_variables = delete_unused_variables
         self.convert_scalars_to_arrays = convert_scalars_to_arrays
         self.convert_numpy_bytes_to_utf16 = convert_numpy_bytes_to_utf16
+        self.convert_numpy_str_to_utf16 = convert_numpy_str_to_utf16
         self.convert_bools_to_uint8 = convert_bools_to_uint8
         self.reverse_dimension_order = reverse_dimension_order
         self.store_shape_for_empty = store_shape_for_empty
@@ -355,6 +362,7 @@ class Options(object):
         delete_unused_variables       ``True``
         convert_scalars_to_arrays     ``True``
         convert_numpy_bytes_to_utf16  ``True``
+        convert_numpy_str_to_utf16    ``True``
         convert_bools_to_uint8        ``True``
         reverse_dimension_order       ``True``
         store_shape_for_empty         ``True``
@@ -379,6 +387,7 @@ class Options(object):
                 self._delete_unused_variables = True
                 self._convert_scalars_to_arrays = True
                 self._convert_numpy_bytes_to_utf16 = True
+                self._convert_numpy_str_to_utf16 = True
                 self._convert_bools_to_uint8 = True
                 self._reverse_dimension_order = True
                 self._store_shape_for_empty = True
@@ -442,8 +451,8 @@ class Options(object):
 
         If ``True`` (defaults to ``False`` unless MATLAB compatibility
         is being done), ``numpy.bytes_`` and anything that is converted
-        to them (``str``, ``bytes``, and ``bytearray``) are converted to
-        UTF-16 before being written to file as ``numpy.uint16``.
+        to them (``bytes`` and ``bytearray``) are converted to UTF-16
+        before being written to file as ``numpy.uint16``.
 
         Must be ``True`` if doing MATLAB compatibility. MATLAB uses
         UTF-16 for its strings.
@@ -451,6 +460,7 @@ class Options(object):
         See Also
         --------
         numpy.bytes_
+        convert_numpy_str_to_utf16
 
         """
         return self._convert_numpy_bytes_to_utf16
@@ -465,6 +475,42 @@ class Options(object):
             self._matlab_compatible = False
 
     @property
+    def convert_numpy_str_to_utf16(self):
+        """ Whether or not to convert numpy.str_ to UTF-16.
+
+        bool
+
+        If ``True`` (defaults to ``False`` unless MATLAB compatibility
+        is being done), ``numpy.str_`` and anything that is converted
+        to them (``str``) will be converted to UTF-16 if possible before
+        being written to file as ``numpy.uint16``. If doing so would
+        lead to a loss of data (character can't be translated to
+        UTF-16) or would change the shape of an array of ``numpy.str_``
+        due to a character being converted into a pair of 2-byte units, the
+        conversion will not be made and the string will be stored in
+        UTF-32 form as a ``numpy.uint32``.
+
+        Must be ``True`` if doing MATLAB compatibility. MATLAB uses
+        UTF-16 for its strings.
+
+        See Also
+        --------
+        numpy.str_
+        convert_numpy_bytes_to_utf16
+
+        """
+        return self._convert_numpy_str_to_utf16
+
+    @convert_numpy_str_to_utf16.setter
+    def convert_numpy_str_to_utf16(self, value):
+        # Check that it is a bool, and then set it. If it is false, we
+        # are not doing MATLAB compatible formatting.
+        if isinstance(value, bool):
+            self._convert_numpy_str_to_utf16 = value
+        if not self._convert_numpy_str_to_utf16:
+            self._matlab_compatible = False
+
+    @property
     def convert_bools_to_uint8(self):
         """ Whether or not to convert bools to ``numpy.uint8``.
 
diff --git a/hdf5storage/utilities.py b/hdf5storage/utilities.py
index 3ddb125..d9d635e 100644
--- a/hdf5storage/utilities.py
+++ b/hdf5storage/utilities.py
@@ -65,6 +65,123 @@ def next_unused_name_in_group(grp, length):
         if name not in existing_names:
             return name
 
+def convert_numpy_str_to_uint16(data):
+    """ Converts a numpy.str_ to UTF-16 encoding in numpy.uint16 form.
+
+    Convert a ``numpy.str_`` or an array of them (they are UTF-32
+    strings) to UTF-16 in the equivalent array of ``numpy.uint16``. The
+    conversion will throw an exception if any characters cannot be
+    converted to UTF-16. Strings are expanded along rows (across columns)
+    so a 2x3x4 array of 10 element strings will get turned into a 2x30x4
+    array of uint16's if every UTF-32 character converts easily to a
+    UTF-16 singlet, as opposed to a UTF-16 doublet.
+
+    Parameters
+    ----------
+    data : numpy.str_ or numpy.ndarray of numpy.str_
+        The string or array of them to convert.
+
+    Returns
+    -------
+    numpy.ndarray of numpy.uint16
+        The result of the conversion.
+
+    Raises
+    ------
+    UnicodeEncodeError
+        If a UTF-32 character has no UTF-16 representation.
+
+    See Also
+    --------
+    convert_numpy_str_to_uint32
+    decode_to_numpy_unicode
+
+    """
+    # An empty string should be an empty uint16
+    if data.nbytes == 0:
+        return np.uint16([])
+
+    # If it is just a string instead of an array of them, then the
+    # string can simply be converted and returned as a 1d array pretty
+    # easily using ndarray's buffer option. The byte order mark, 2
+    # bytes, needs to be removed.
+    if not isinstance(data, np.ndarray):
+        s = data.encode(encoding='UTF-16', errors='strict')
+        return np.ndarray(shape=((len(s)-2)//2,), dtype='uint16',
+                          buffer=s[2:])
+
+    # It is an array of strings. Each string in the array needs to be
+    # converted. An object array is needed to hold all the converted
+    # forms, as opposed to just constructing the final uint16 array,
+    # because the converted forms could end up with greatly differing lengths
+    # depending on how many characters turn into doublets. The sizes of
+    # each one need to be grabbed along the way to be able to construct
+    # the final array. The easiest way to convert each string is to use
+    # recursion.
+    converted_strings = np.ndarray(shape=data.shape, dtype='object')
+    sizes = np.zeros(shape=data.shape, dtype='int64')
+
+    for index, x in np.ndenumerate(data):
+        converted_strings[index] = convert_numpy_str_to_uint16(x)
+        sizes[index] = np.prod(converted_strings[index].shape)
+
+    # The shape of the new array is simply the shape of the old one with
+    # the number of columns increased multiplicatively by the size of
+    # the largest UTF-16 string so that everything will fit.
+    length = np.max(sizes)
+    shape = list(data.shape)
+    shape[-1] *= length
+    new_data = np.zeros(shape=tuple(shape), dtype='uint16')
+
+    # Copy each string into new_data using clever indexing (using the
+    # first part of index returns a 1d subarray that can be
+    # addressed). Then, the conversion is done.
+    for index, x in np.ndenumerate(converted_strings):
+        new_data[index[:-1]][ \
+            (length*index[-1]):(length*index[-1]+sizes[index])] = x
+
+    return new_data
+
+def convert_numpy_str_to_uint32(data):
+    """ Converts a numpy.str_ to its numpy.uint32 representation.
+
+    Convert a ``numpy.str_`` or an array of them (they are UTF-32
+    strings) into the equivalent array of ``numpy.uint32`` that is byte
+    for byte identical. Strings are expanded along rows (across columns)
+    so a 2x3x4 array of 10 element strings will get turned into a 2x30x4
+    array of uint32's.
+
+    Parameters
+    ----------
+    data : numpy.str_ or numpy.ndarray of numpy.str_
+        The string or array of them to convert.
+
+    Returns
+    -------
+    numpy.ndarray of numpy.uint32
+        The result of the conversion.
+
+    See Also
+    --------
+    convert_numpy_str_to_uint16
+    decode_to_numpy_unicode
+
+    """
+    if data.nbytes == 0:
+        # An empty string should be an empty uint32.
+        return np.uint32([])
+    else:
+        # We need to calculate the new shape from the current shape,
+        # which will have to be expanded along the rows to fit all the
+        # characters (the dtype.itemsize gets the number of bytes in
+        # each string, which is just 4 times the number of
+        # characters). Then it is a matter of getting a view of the
+        # string (in flattened form so that it is contiguous) as uint32
+        # and then reshaping it.
+        shape = list(np.atleast_1d(data).shape)
+        shape[-1] *= data.dtype.itemsize//4
+        return data.flatten().view(np.uint32).reshape(tuple(shape))
+
 def decode_to_str(data):
     """ Decodes data to the Python str type.
 

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-science/packages/python-hdf5storage.git